├── tests │   ├── __init__.py │   ├── test_matcher.py │   └── test_dframcy.py ├── requirements.txt ├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── dframcy │   ├── __init__.py │   ├── cli.py │   ├── utils.py │   ├── dframcy.py │   └── matcher.py ├── tox.ini ├── .coveragerc ├── LICENSE ├── setup.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | wasabi 3 | pytest 4 | pytest-cov 5 | tox 6 | tox-travis 7 | pandas 8 | xlrd 9 | spacy>=3.0.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea/ 3 | .coverage 4 | dframcy.egg-info/ 5 | build/ 6 | dist/ 7 | htmlcov/ 8 | .tox/ 9 | coverage.xml 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: bionic 3 | python: 4 | - "3.6" 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install . 8 | script: 9 | - tox -vv -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | include setup.cfg 5 | include data/* 6 | 7 | global-exclude __pycache__ 8 | global-exclude *.py[co] 9 | global-exclude .* -------------------------------------------------------------------------------- /dframcy/__init__.py: -------------------------------------------------------------------------------- 1 | from dframcy.dframcy import DframCy 2 | from dframcy.matcher import DframCyMatcher, DframCyPhraseMatcher, DframCyDependencyMatcher 3 | 4 | __version__ = "0.1.6" 5 | __all__ = ["DframCy", "DframCyMatcher", "DframCyPhraseMatcher", "DframCyDependencyMatcher"] 6 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{36} 3 | skip_missing_interpreters = True 4 | [testenv] 5 | passenv = 6 | CI 7 | TRAVIS 8 | TRAVIS_* 9 | deps = 10 | codecov 11 | commands = 12 | python3 -m spacy download en_core_web_sm 13 | py.test --cov-report=xml --cov=dframcy tests/ 14 | codecov -e TOXENV -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | dframcy 4 | tests 5 | branch = False 6 | omit = setup.py 7 | dframcy/cli.py 8 | 9 | [report] 10 | show_missing = True 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | RuntimeError 15 | NotImplementedError 16 | FileNotFoundError 17 | ImportError 18 | KeyError 19 | IOError 20 | if __name__ == .__main__.: 21 | messenger -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2019 YASH 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from io import open 3 | from setuptools import setup, find_packages 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 8 | long_description = f.read() 9 | 10 | with open(path.join(here, "requirements.txt"), encoding="utf-8") as f: 11 | REQUIRES = [line.strip("\n") for line in f] 12 | 13 | setup( 14 | name="dframcy", 15 | version="0.1.6", 16 | description="Pandas Dataframe integration for spaCy", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/yash1994/dframcy", 20 | author="Yash Patadia", 21 | author_email="yash@patadia.org", 22 | classifiers=[ 23 | "Development Status :: 3 - Alpha", 24 | "Intended Audience :: Developers", 25 | "Topic :: Software Development :: Build Tools", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3.6", 28 | ], 29 | keywords=["spacy", "dataframe", "pandas"], 30 | packages=find_packages(), 31 | install_requires=REQUIRES, 32 | tests_require=["pytest"], 33 | entry_points={ 34 | "console_scripts": ["dframcy=dframcy.cli:main"], 35 | }, 36 | ) 37 | -------------------------------------------------------------------------------- /dframcy/cli.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import json 5 | import click 6 | import spacy 7 | from io import open 8 | from pathlib import Path 9 | from wasabi import Printer 10 | from .dframcy import DframCy 11 | from .utils import get_default_columns 12 | 13 | messenger = Printer() 14 | DEFAULT_COLUMNS = ",".join(get_default_columns()) 15 | 16 | 17 | @click.group() 18 | def main(): 19 | pass 20 | 21 | 22 | @main.command() 23 | @click.option( 24 | "--input_file", "-i", required=True, type=Path, help="Input text file path." 
25 | ) 26 | @click.option("--output_file", "-o", required=True, type=Path, help="Output file path.") 27 | @click.option( 28 | "--format", 29 | "-f", 30 | default="csv", 31 | show_default=True, 32 | type=click.Choice(["json", "csv"], case_sensitive=False), 33 | help="Output file format (json/csv)", 34 | ) 35 | @click.option( 36 | "--language_model", 37 | "-l", 38 | default="en_core_web_sm", 39 | show_default=True, 40 | type=str, 41 | help="Language model to be used.", 42 | ) 43 | @click.option( 44 | "--columns", 45 | "-c", 46 | default=DEFAULT_COLUMNS, 47 | show_default=True, 48 | type=str, 49 | help="Annotations to be included in dataframe.", 50 | ) 51 | @click.option( 52 | "--separate_entity_frame", 53 | "-s", 54 | default=False, 55 | show_default=True, 56 | type=bool, 57 | help="Save separate entity dataframe.", 58 | ) 59 | def dframe( 60 | input_file, output_file, format, language_model, columns, separate_entity_frame 61 | ): 62 | format = format.lower() 63 | 64 | if output_file.is_dir(): 65 | output_file = output_file.joinpath(input_file.stem + "." + str(format)) 66 | if input_file.exists(): 67 | with open(input_file, "r") as infile: 68 | text = infile.read().strip("\n").strip() 69 | 70 | nlp = spacy.load(language_model) 71 | dframcy = DframCy(nlp) 72 | doc = nlp(text) 73 | 74 | annotation_dataframe = dframcy.to_dataframe( 75 | doc=doc, 76 | columns=columns.split(","), 77 | separate_entity_dframe=separate_entity_frame, 78 | ) 79 | 80 | if separate_entity_frame: 81 | token_annotation_dataframe, entity_dataframe = annotation_dataframe 82 | else: 83 | token_annotation_dataframe = annotation_dataframe 84 | entity_dataframe = None 85 | 86 | if format == "csv": 87 | token_annotation_dataframe.to_csv(output_file) 88 | if separate_entity_frame: 89 | # build the companion file name with pathlib; str.strip(".csv") strips characters, not the suffix 90 | entity_output_file = output_file.with_name( 91 | output_file.stem + "_entity.csv" 92 | ) 93 | entity_dataframe.to_csv(entity_output_file) 94 | elif format == "json": 95 | annotation_json = json.loads(token_annotation_dataframe.to_json(orient="columns")) 96 | json.dump(annotation_json, open(output_file, "w")) 97 | if separate_entity_frame: 98 | entity_output_file = output_file.with_name( 99 | output_file.stem + "_entity.json" 100 | ) 101 | entity_json = json.loads(entity_dataframe.to_json(orient="columns")) 102 | json.dump(entity_json, open(entity_output_file, "w")) 103 | else: 104 | messenger.fail( 105 | "input file path: {} does not exist".format(input_file), exits=-1 106 | ) -------------------------------------------------------------------------------- /dframcy/utils.py: -------------------------------------------------------------------------------- 1 | from wasabi import Printer 2 | 3 | messenger = Printer() 4 | 5 | 6 | def get_default_columns(): 7 | """ 8 | Default columns for dataframe 9 | :return: list of default attributes 10 | """ 11 | return ["id", "text", "start", "end", "pos_", "tag_", "dep_", "head", "ent_type_"] 12 | 13 | 14 | def get_spacy_token_class_config(): 15 | """ 16 | Configuration of spaCy's Token class attributes 17 | :return: config dictionary of attributes/properties 18 | """ 19 | token_config = { 20 | "PROPERTIES": [ 21 | "lefts", 22 | "rights", 23 | "n_lefts", 24 | "subtree", 25 | "children", 26 | "n_rights", 27 | "ancestors", 28 | "conjuncts", 29 | "has_vector", 30 | "is_sent_start", 31 | ], 32 | "ATTRIBUTES": [ 33 | "text", 34 | "head", 35 | "pos_", 36 | "tag_", 37 | "dep_", 38 | "orth_", 39 | "norm_", 40 | "lang_", 41 | "lemma_", 42 | "lower_", 43 | "shape_", 44 | "is_oov", 45 | "ent_id_", 46 | "prefix_", 47 | "suffix_", 48 | "is_stop", 49 | "ent_iob_", 50 | "is_alpha", 51 | "is_ascii",
"is_digit", 53 | "is_lower", 54 | "is_upper", 55 | "is_title", 56 | "is_punct", 57 | "is_space", 58 | "is_quote", 59 | "like_url", 60 | "like_num", 61 | "left_edge", 62 | "ent_type_", 63 | "right_edge", 64 | "ent_kb_id_", 65 | "is_bracket", 66 | "like_email", 67 | "is_currency", 68 | "is_left_punct", 69 | "is_right_punct", 70 | ], 71 | "ADDITIONAL_ATTRIBUTES": ["id", "start", "end"], 72 | "INT_FORMAT_ATTRIBUTES": [ 73 | "pos", 74 | "tag", 75 | "dep", 76 | "orth", 77 | "norm", 78 | "lang", 79 | "lower", 80 | "shape", 81 | "ent_id", 82 | "prefix", 83 | "suffix", 84 | "ent_iob", 85 | "ent_type", 86 | ], 87 | } 88 | 89 | return token_config 90 | 91 | 92 | def check_columns_consistency(columns): 93 | """ 94 | Checks consistency of column names passed by users 95 | with spacy's Token class. 96 | :param columns: list of column names 97 | :return: list of consistent column names 98 | """ 99 | spacy_token_config = get_spacy_token_class_config() 100 | consistent_column_names = [] 101 | for column_name in columns: 102 | if column_name in spacy_token_config["PROPERTIES"]: 103 | consistent_column_names.append((column_name, "property")) 104 | elif column_name in spacy_token_config["ATTRIBUTES"]: 105 | consistent_column_names.append((column_name, "attribute")) 106 | elif column_name in spacy_token_config["ADDITIONAL_ATTRIBUTES"]: 107 | consistent_column_names.append((column_name, "additional_attribute")) 108 | elif column_name in spacy_token_config["INT_FORMAT_ATTRIBUTES"]: 109 | consistent_column_names.append((column_name, "int_format_attribute")) 110 | else: 111 | messenger.warn( 112 | "Column name '{}' not consistent with spacy's Token class".format( 113 | column_name 114 | ) 115 | ) 116 | 117 | return consistent_column_names 118 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DframCy 2 | 3 | [![Package Version](https://img.shields.io/pypi/v/dframcy.svg?&service=github)](https://pypi.python.org/pypi/dframcy/) 4 | [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | [![Build Status](https://travis-ci.org/yash1994/dframcy.svg?branch=master)](https://travis-ci.org/yash1994/dframcy) 6 | [![codecov](https://codecov.io/gh/yash1994/dframcy/branch/master/graph/badge.svg)](https://codecov.io/gh/yash1994/dframcy) 7 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 8 | 9 | DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks. DframCy provides clean APIs to convert spaCy's linguistic annotations, Matcher and PhraseMatcher information to Pandas dataframe, also supports training and evaluation of NLP pipeline from CSV/XLXS/XLS without any changes to spaCy's underlying APIs. 10 | 11 | ## Getting Started 12 | 13 | DframCy can be easily installed. 
85 | ### Rule-Based Matching 86 | 87 | ```python 88 | # Token-based Matching 89 | import spacy 90 | 91 | nlp = spacy.load("en_core_web_sm") 92 | 93 | from dframcy.matcher import DframCyMatcher, DframCyPhraseMatcher, DframCyDependencyMatcher 94 | dframcy_matcher = DframCyMatcher(nlp) 95 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 96 | dframcy_matcher.add("HelloWorld", [pattern]) 97 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 98 | matches_dataframe = dframcy_matcher(doc) 99 | 100 | # Phrase Matching 101 | dframcy_phrase_matcher = DframCyPhraseMatcher(nlp) 102 | terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."] 103 | patterns = [dframcy_phrase_matcher.nlp.make_doc(text) for text in terms] 104 | dframcy_phrase_matcher.add("TerminologyList", patterns) 105 | doc = dframcy_phrase_matcher.nlp(u"German Chancellor Angela Merkel and US President Barack Obama " 106 | u"converse in the Oval Office inside the White House in Washington, D.C.") 107 | phrase_matches_dataframe = dframcy_phrase_matcher(doc) 108 | 109 | # Dependency Matching 110 | dframcy_dependency_matcher = DframCyDependencyMatcher(nlp) 111 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 112 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 113 | doc = dframcy_dependency_matcher.nlp(u"Bill Gates founded Microsoft. And Elon Musk founded SpaceX") 114 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 115 | ``` 116 |
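117 | ### Entity Ruler 118 | 119 | DframCy also exposes spaCy's [EntityRuler](https://spacy.io/api/entityruler) through `add_entity_ruler`, which adds an `entity_ruler` component to the pipeline; combined with `separate_entity_dframe=True`, the recognized entities land in their own dataframe. A minimal sketch, reusing the `dframcy` object from the example above and mirroring the project's test suite: 120 | 121 | ```python 122 | patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] 123 | dframcy.add_entity_ruler(patterns) 124 | doc = dframcy.nlp(u"MyCorp Inc. is a company in the U.S.") 125 | token_dataframe, entity_dataframe = dframcy.to_dataframe(doc, separate_entity_dframe=True) 126 | ``` 127 |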
And Elon Musk founded SpaceX") 107 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 108 | ``` 109 | 110 | ### Command Line Interface 111 | 112 | Dframcy supports command-line arguments for the conversion of a plain text file to linguistically annotated text in CSV/JSON format. 113 | Previous versions of Dframcy were used to support CLI utilities for training and evaluation of spaCy models from CSV/XLS files. 114 | After the [v3](https://spacy.io/usage/v3) release, spaCy's training pipeline has become much more flexible and robust so didn't want to introduce additional 115 | step using Dframcy for just format conversion (CSV/XLS to [spaCy’s binary format](https://spacy.io/api/data-formats#binary-training)). 116 | 117 | ```bash 118 | # convert 119 | dframcy dframe -i plain_text.txt -o annotations.csv -f csv 120 | ``` 121 | -------------------------------------------------------------------------------- /tests/test_matcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import spacy 5 | import pandas as pd 6 | from pandas.testing import assert_frame_equal 7 | from dframcy.matcher import ( 8 | DframCyMatcher, 9 | DframCyPhraseMatcher, 10 | DframCyDependencyMatcher, 11 | ) 12 | 13 | 14 | dframcy_matcher = DframCyMatcher(spacy.load("en_core_web_sm")) 15 | dframcy_phrase_matcher = DframCyPhraseMatcher( 16 | spacy.load("en_core_web_sm"), attr="LOWER" 17 | ) 18 | dframcy_dependency_matcher = DframCyDependencyMatcher(spacy.load("en_core_web_sm")) 19 | 20 | 21 | def test_matcher(): 22 | dframcy_matcher.reset() 23 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 24 | dframcy_matcher.add("HelloWorld", [pattern]) 25 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 26 | spacy_matcher = dframcy_matcher.matcher 27 | matches = spacy_matcher(doc) 28 | assert matches[0][0] == 15578876784678163569 29 | assert matches[0][1] == 0 30 | assert matches[0][2] == 3 31 | assert dframcy_matcher.nlp.vocab.strings[matches[0][0]] == "HelloWorld" 32 | assert doc[matches[0][1] : matches[0][2]].text == "Hello, world" 33 | 34 | dframcy_matcher.remove("HelloWorld") 35 | assert "HelloWorld" not in dframcy_matcher.matcher 36 | 37 | 38 | def test_matcher_dataframe(): 39 | dframcy_matcher.reset() 40 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 41 | dframcy_matcher.add("HelloWorld", [pattern]) 42 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 43 | matches_dataframe = dframcy_matcher(doc) 44 | results = pd.DataFrame( 45 | { 46 | "start": [0], 47 | "end": [3], 48 | "string_id": ["HelloWorld"], 49 | "span_text": ["Hello, world"], 50 | } 51 | ) 52 | assert_frame_equal(matches_dataframe, results) 53 | 54 | 55 | def test_matcher_dataframe_multiple_patterns(): 56 | dframcy_matcher.reset() 57 | dframcy_matcher.add( 58 | "Hello_World", 59 | [ 60 | [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], 61 | [{"LOWER": "hello"}, {"LOWER": "world"}], 62 | ], 63 | None, 64 | ) 65 | doc = dframcy_matcher.nlp("Hello, world! 
Hello world!") 66 | matches_dataframe = dframcy_matcher(doc) 67 | results = pd.DataFrame( 68 | { 69 | "start": [0, 4], 70 | "end": [3, 6], 71 | "string_id": ["Hello_World", "Hello_World"], 72 | "span_text": ["Hello, world", "Hello world"], 73 | } 74 | ) 75 | assert_frame_equal(matches_dataframe, results) 76 | 77 | 78 | def test_phrase_matcher(): 79 | patterns = [ 80 | dframcy_phrase_matcher.nlp.make_doc(name) 81 | for name in ["Angela Merkel", "Barack Obama"] 82 | ] 83 | dframcy_phrase_matcher.add("Names", patterns) 84 | doc = dframcy_phrase_matcher.nlp("angela merkel and us president barack Obama") 85 | spacy_phrase_matcher = dframcy_phrase_matcher.phrase_matcher 86 | matches = spacy_phrase_matcher(doc) 87 | assert matches[0][0] == 10631222085860127603 88 | assert matches[0][1] == 0 89 | assert matches[0][2] == 2 90 | assert doc[matches[0][1] : matches[0][2]].text == "angela merkel" 91 | assert doc[matches[1][1] : matches[1][2]].text == "barack Obama" 92 | 93 | dframcy_phrase_matcher.remove("Names") 94 | assert "Names" not in dframcy_phrase_matcher.phrase_matcher 95 | 96 | 97 | def test_phrase_matcher_dataframe(): 98 | dframcy_phrase_matcher.reset() 99 | terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] 100 | patterns = [dframcy_phrase_matcher.nlp.make_doc(text) for text in terms] 101 | dframcy_phrase_matcher.add("TerminologyList", patterns) 102 | doc = dframcy_phrase_matcher.nlp( 103 | "German Chancellor Angela Merkel and US President Barack Obama " 104 | "converse in the Oval Office inside the White House in Washington, D.C." 105 | ) 106 | phrase_matches_dataframe = dframcy_phrase_matcher(doc) 107 | results = pd.DataFrame( 108 | { 109 | "start": [2, 7, 19], 110 | "end": [4, 9, 22], 111 | "span_text": ["Angela Merkel", "Barack Obama", "Washington, D.C."], 112 | } 113 | ) 114 | assert_frame_equal(phrase_matches_dataframe, results) 115 | 116 | 117 | def test_dependency_matcher(): 118 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 119 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 120 | doc = dframcy_dependency_matcher.nlp("Bill Gates founded Microsoft.") 121 | spacy_dependency_matcher = dframcy_dependency_matcher.dependency_matcher 122 | matches = spacy_dependency_matcher(doc) 123 | assert matches[0][0] == 4851363122962674176 124 | assert len(matches[0][1]) == 1 125 | assert matches[0][1][0] == 2 126 | assert "FOUNDED" in dframcy_dependency_matcher.dependency_matcher 127 | dframcy_dependency_matcher.remove("FOUNDED") 128 | assert "FOUNDED" not in dframcy_dependency_matcher.dependency_matcher 129 | 130 | 131 | def test_dependency_matcher_dataframe(): 132 | dframcy_dependency_matcher.reset() 133 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 134 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 135 | doc = dframcy_dependency_matcher.nlp( 136 | "Bill Gates founded Microsoft. 
And Elon Musk founded SpaceX" 137 | ) 138 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 139 | results = pd.DataFrame( 140 | {"token_index": ["2", "8"], "token_text": ["founded", "founded"]} 141 | ) 142 | assert_frame_equal(dependency_matches_dataframe, results) 143 | -------------------------------------------------------------------------------- /tests/test_dframcy.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import os 5 | import json 6 | import spacy 7 | import pytest 8 | import pandas as pd 9 | from dframcy.dframcy import DframCy 10 | from pandas.testing import assert_frame_equal 11 | 12 | dframcy = DframCy(spacy.load("en_core_web_sm")) 13 | 14 | current_dir = os.path.dirname(os.path.realpath(__file__)) 15 | project_root = "/" + "/".join(current_dir.split("/")[1:-1]) 16 | data_dir = project_root + "/data" 17 | 18 | 19 | @pytest.mark.parametrize("text", ["I am here in USA."]) 20 | def test_nlp_pipeline(text): 21 | doc = dframcy.nlp(text) 22 | assert doc[0].text == "I" 23 | assert doc[0].tag_ == "PRP" 24 | assert doc[1].lemma_ == "be" 25 | 26 | 27 | @pytest.mark.parametrize("text", ["I am here in USA."]) 28 | def test_default_columns(text): 29 | doc = dframcy.nlp(text) 30 | dataframe = dframcy.to_dataframe(doc) 31 | results = pd.DataFrame( 32 | { 33 | "token_text": ["I", "am", "here", "in", "USA", "."], 34 | "token_start": [0, 2, 5, 10, 13, 16], 35 | "token_end": [1, 4, 9, 12, 16, 17], 36 | "token_pos_": ["PRON", "AUX", "ADV", "ADP", "PROPN", "PUNCT"], 37 | "token_tag_": ["PRP", "VBP", "RB", "IN", "NNP", "."], 38 | "token_dep_": ["nsubj", "ROOT", "advmod", "prep", "pobj", "punct"], 39 | "token_head": ["am", "am", "am", "here", "in", "am"], 40 | "token_ent_type_": ["", "", "", "", "GPE", ""], 41 | } 42 | ) 43 | assert_frame_equal(dataframe, results) 44 | 45 | 46 | @pytest.mark.parametrize("text", ["bright red apples on the tree"]) 47 | def test_unknown_column_value(text): 48 | doc = dframcy.nlp(text) 49 | dataframe = dframcy.to_dataframe(doc, columns=["id", "start", "end", "apple"]) 50 | results = pd.DataFrame( 51 | {"token_start": [0, 7, 11, 18, 21, 25], "token_end": [6, 10, 17, 20, 24, 29]} 52 | ) 53 | assert_frame_equal(dataframe, results) 54 | 55 | 56 | @pytest.mark.parametrize("text", ["I have an apple"]) 57 | def test_custom_attribute(text): 58 | from spacy.tokens import Token 59 | 60 | fruit_getter = lambda token: token.text in ("apple", "pear", "banana") 61 | Token.set_extension("is_fruit", getter=fruit_getter) 62 | doc = dframcy.nlp(text) 63 | dataframe = dframcy.to_dataframe( 64 | doc, columns=["id", "start", "end"], custom_attributes=["is_fruit"] 65 | ) 66 | results = pd.DataFrame( 67 | { 68 | "token_start": [0, 2, 7, 10], 69 | "token_end": [1, 6, 9, 15], 70 | "token_is_fruit": [False, False, False, True], 71 | } 72 | ) 73 | assert_frame_equal(dataframe, results) 74 | 75 | 76 | def test_all_columns_thoroughly(): 77 | doc = dframcy.nlp( 78 | "Machine learning is an application of artificial intelligence (AI) that provides systems the " 79 | "ability to automatically learn and improve from experience without being explicitly " 80 | "programmed. Machine learning focuses on the development of computer programs that can access " 81 | "data and use it learn for themselves." 
82 | ) 83 | dataframe = dframcy.to_dataframe( 84 | doc, 85 | [ 86 | "id", 87 | "end", 88 | "pos", 89 | "tag", 90 | "dep", 91 | "text", 92 | "head", 93 | "pos_", 94 | "tag_", 95 | "dep_", 96 | "orth", 97 | "norm", 98 | "lang", 99 | "orth_", 100 | "norm_", 101 | "lang_", 102 | "lefts", 103 | "start", 104 | "lower", 105 | "shape", 106 | "lemma_", 107 | "lower_", 108 | "shape_", 109 | "is_oov", 110 | "rights", 111 | "ent_id", 112 | "prefix", 113 | "suffix", 114 | "ent_id_", 115 | "prefix_", 116 | "suffix_", 117 | "is_stop", 118 | "n_lefts", 119 | "subtree", 120 | "ent_iob", 121 | "ent_iob_", 122 | "is_alpha", 123 | "is_ascii", 124 | "is_digit", 125 | "is_lower", 126 | "is_upper", 127 | "is_title", 128 | "is_punct", 129 | "is_space", 130 | "is_quote", 131 | "like_url", 132 | "like_num", 133 | "children", 134 | "n_rights", 135 | "ent_type", 136 | "left_edge", 137 | "ent_type_", 138 | "ancestors", 139 | "conjuncts", 140 | "right_edge", 141 | "ent_kb_id_", 142 | "is_bracket", 143 | "like_email", 144 | "has_vector", 145 | "is_currency", 146 | "is_left_punct", 147 | "is_sent_start", 148 | "is_right_punct", 149 | ], 150 | ) 151 | 152 | assert dataframe.shape == (48, 62) 153 | assert dataframe["token_ancestors"][0] == "learning, is" 154 | assert (dataframe.token_is_lower).sum() == 41 155 | assert (~dataframe.token_is_lower).sum() == 7 156 | 157 | 158 | def test_entity_rule_dataframe(): 159 | dframcy_test_ent = DframCy(spacy.load("en_core_web_sm")) 160 | patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] 161 | dframcy_test_ent.add_entity_ruler(patterns) 162 | doc = dframcy_test_ent.nlp("MyCorp Inc. is a company in the U.S.") 163 | _, entity_frame = dframcy_test_ent.to_dataframe(doc, separate_entity_dframe=True) 164 | results = pd.DataFrame( 165 | {"ent_text": ["MyCorp Inc.", "U.S."], "ent_label": ["ORG", "GPE"]} 166 | ) 167 | assert_frame_equal(entity_frame, results) 168 | 169 | 170 | def test_sentence_without_named_entities(): 171 | doc = dframcy.nlp("Autonomous cars shift insurance liability toward manufacturers.") 172 | dataframe = dframcy.to_dataframe(doc, ["pos_", "tag_", "ent_type_"]) 173 | 174 | assert "token_ent_type_" not in dataframe.columns 175 | -------------------------------------------------------------------------------- /dframcy/dframcy.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import pandas as pd 5 | 6 | from dframcy import utils 7 | 8 | 9 | class DframCy(object): 10 | """ 11 | Dataframe integration with spaCy's linguistic annotations. 12 | """ 13 | 14 | def __init__(self, nlp_pipeline): 15 | """ 16 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 
17 | """ 18 | self._nlp = nlp_pipeline 19 | 20 | @property 21 | def nlp(self): 22 | """ 23 | To get texted nlped 24 | :return: Spacy's Doc object 25 | """ 26 | return self._nlp 27 | 28 | @staticmethod 29 | def get_token_attribute_value(token, attribute_name, _type): 30 | """ 31 | To get value of specific attribute of spacy's Token class 32 | :param token: token object of class Token 33 | :param attribute_name: name attribute for which value is required 34 | :param _type: type of class attribute (property, attribute) 35 | :retrun: attribute value 36 | """ 37 | if _type == "attribute" or _type == "int_format_attribute": 38 | value = getattr(token, attribute_name) 39 | if attribute_name in ["head", "left_edge", "right_edge"]: 40 | return value.text 41 | else: 42 | return value 43 | elif _type == "property": 44 | value = getattr(token, attribute_name) 45 | if attribute_name in ["n_lefts", "n_rights", "has_vector", "is_sent_start"]: 46 | return value 47 | else: 48 | return ", ".join([v.text for v in value]) 49 | elif _type == "additional_attribute": 50 | if attribute_name == "id": 51 | return getattr(token, "i") 52 | elif attribute_name == "start": 53 | return getattr(token, "idx") 54 | elif attribute_name == "end": 55 | return getattr(token, "idx") + len(token) 56 | elif _type == "custom_attributes": 57 | return getattr(getattr(token, "_"), attribute_name) 58 | 59 | def get_token_attribute_dict(self, doc, consistent_columns): 60 | """ 61 | To get attribute dictionary for sequence of Token object in Doc 62 | :param doc: Doc object 63 | :param consistent_columns: name attributes required with its type 64 | :return: python dictionary containing attributes names as keys 65 | and list of all token values as value. 66 | """ 67 | token_attribute_dictionary = {} 68 | for token in doc: 69 | for column_name in consistent_columns: 70 | if column_name[0] in token_attribute_dictionary: 71 | token_attribute_dictionary[column_name[0]].append( 72 | self.get_token_attribute_value( 73 | token, column_name[0], column_name[1] 74 | ) 75 | ) 76 | else: 77 | token_attribute_dictionary[column_name[0]] = [] 78 | token_attribute_dictionary[column_name[0]].append( 79 | self.get_token_attribute_value( 80 | token, column_name[0], column_name[1] 81 | ) 82 | ) 83 | return token_attribute_dictionary 84 | 85 | @staticmethod 86 | def get_named_entity_dict(doc): 87 | """ 88 | To get named entities from NLP processed text 89 | :param doc: spacy container for linguistic annotations. 90 | :return: dictionary containing entity_text and entity_label 91 | """ 92 | entity_details_dict = {"ent_text": [], "ent_label": []} 93 | for ent in doc.ents: 94 | entity_details_dict["ent_text"].append(ent.text) 95 | entity_details_dict["ent_label"].append(ent.label_) 96 | return entity_details_dict 97 | 98 | def to_dataframe( 99 | self, doc, columns=None, separate_entity_dframe=False, custom_attributes=None 100 | ): 101 | """ 102 | Convert Linguistic annotations for text into pandas dataframe 103 | :param doc: spacy container for linguistic annotations. 
104 | :param columns: list of str, name of columns to be included in dataframe (default: 105 | ["id", "text", "start", "end", "pos_", "tag_", "dep_", "head", "ent_type_"]) 106 | :param separate_entity_dframe: bool, for separate entity dataframe (default: False) 107 | :param custom_attributes: list, for custom attribute 108 | :return: dataframe, dataframe containing linguistic annotations 109 | """ 110 | if columns is None: 111 | columns = utils.get_default_columns() 112 | 113 | if "id" not in columns: 114 | columns = ["id"] + columns 115 | 116 | consistent_columns = utils.check_columns_consistency(columns) 117 | 118 | if custom_attributes: 119 | consistent_columns += [ 120 | (attr, "custom_attributes") for attr in custom_attributes 121 | ] 122 | 123 | token_attribute_dictionary = self.get_token_attribute_dict( 124 | doc, consistent_columns 125 | ) 126 | tokens_dataframe = pd.DataFrame.from_dict(token_attribute_dictionary) 127 | 128 | new_column_names_map = {i: "token_" + i for i in tokens_dataframe.columns} 129 | 130 | tokens_dataframe.rename(columns=new_column_names_map, inplace=True) 131 | 132 | tokens_dataframe.reindex(tokens_dataframe["token_id"]) 133 | 134 | tokens_dataframe.drop(columns=["token_id"], inplace=True) 135 | 136 | if not doc.ents and "token_ent_type_" in tokens_dataframe.columns: 137 | tokens_dataframe.drop(columns=["token_ent_type_"], inplace=True) 138 | 139 | if separate_entity_dframe: 140 | entity_dict = self.get_named_entity_dict(doc) 141 | entity_dataframe = pd.DataFrame.from_dict(entity_dict) 142 | 143 | return ( 144 | tokens_dataframe 145 | if not separate_entity_dframe 146 | else (tokens_dataframe, entity_dataframe) 147 | ) 148 | 149 | def add_entity_ruler(self, patterns): 150 | """ 151 | To add entity ruler in nlp pipeline 152 | official doc: https://spacy.io/api/entityruler 153 | :param patterns: list or list of lists of token/phrase based patterns 154 | """ 155 | ruler = self._nlp.add_pipe("entity_ruler") 156 | ruler.add_patterns(patterns) 157 | -------------------------------------------------------------------------------- /dframcy/matcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import pandas as pd 5 | from spacy.matcher import Matcher 6 | from spacy.matcher import PhraseMatcher 7 | from spacy.matcher import DependencyMatcher 8 | 9 | 10 | class DframCyMatcher(object): 11 | """ 12 | Dataframe wrapper class over spaCy's Matcher 13 | https://spacy.io/api/matcher 14 | """ 15 | 16 | def __init__(self, nlp_pipeline, validate=False): 17 | """ 18 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 19 | """ 20 | self._nlp = nlp_pipeline 21 | self._matcher = None 22 | self.validate = validate 23 | 24 | @property 25 | def nlp(self): 26 | return self._nlp 27 | 28 | @property 29 | def matcher(self): 30 | return self._matcher 31 | 32 | def __call__(self, doc): 33 | """ 34 | To find all token sequences matching the supplied patterns on the Doc 35 | :param doc: spacy container for linguistic annotations. 36 | :return: dataframe, containing matched occurrences. 
37 | """ 38 | df_format_json = {} 39 | matches = self._matcher(doc) 40 | for match_id, start, end in matches: 41 | if "match_id" not in df_format_json: 42 | df_format_json["match_id"] = [] 43 | df_format_json["match_id"].append(match_id) 44 | else: 45 | df_format_json["match_id"].append(match_id) 46 | if "start" not in df_format_json: 47 | df_format_json["start"] = [] 48 | df_format_json["start"].append(start) 49 | else: 50 | df_format_json["start"].append(start) 51 | if "end" not in df_format_json: 52 | df_format_json["end"] = [] 53 | df_format_json["end"].append(end) 54 | else: 55 | df_format_json["end"].append(end) 56 | if "string_id" not in df_format_json: 57 | df_format_json["string_id"] = [] 58 | df_format_json["string_id"].append(self._nlp.vocab.strings[match_id]) 59 | else: 60 | df_format_json["string_id"].append(self._nlp.vocab.strings[match_id]) 61 | if "span_text" not in df_format_json: 62 | df_format_json["span_text"] = [] 63 | df_format_json["span_text"].append(doc[start:end].text) 64 | else: 65 | df_format_json["span_text"].append(doc[start:end].text) 66 | matches_dataframe = pd.DataFrame.from_dict(df_format_json) 67 | matches_dataframe.reindex(matches_dataframe["match_id"]) 68 | matches_dataframe.drop(columns=["match_id"], inplace=True) 69 | 70 | return matches_dataframe 71 | 72 | def get_matcher(self): 73 | """ 74 | To initialize spaCy's matcher class object. 75 | :return: Matcher object 76 | """ 77 | return Matcher(self._nlp.vocab, validate=self.validate) 78 | 79 | def add(self, pattern_name, patterns, callback=None): 80 | """ 81 | To add patterns to spaCy's matcher object 82 | :param pattern_name: str, pattern name 83 | :param patterns: list of patterns 84 | :param callback: function, callback function to be invoked on matched occurrences. 85 | """ 86 | if not self._matcher: 87 | self._matcher = self.get_matcher() 88 | self._matcher.add(pattern_name, patterns, on_match=callback) 89 | 90 | def remove(self, pattern_name): 91 | """ 92 | To remove pattern from spaCy's matcher object 93 | :param pattern_name: str, pattern_name 94 | """ 95 | if self._matcher: 96 | self._matcher.remove(pattern_name) 97 | 98 | def reset(self): 99 | """ 100 | To re-initialize spaCy's matcher object 101 | """ 102 | self._matcher = self.get_matcher() 103 | 104 | 105 | class DframCyPhraseMatcher(object): 106 | """ 107 | Dataframe wrapper class over spaCy's PhraseMatcher 108 | https://spacy.io/api/phrasematcher 109 | """ 110 | 111 | def __init__(self, nlp_pipeline, attr="ORTH", validate=False): 112 | """ 113 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 114 | :param attr: str, token attribute to match on (default: "ORTH") 115 | """ 116 | self._nlp = nlp_pipeline 117 | self._phrase_matcher = None 118 | self.attribute = attr 119 | self.validate = validate 120 | 121 | @property 122 | def nlp(self): 123 | return self._nlp 124 | 125 | @property 126 | def phrase_matcher(self): 127 | return self._phrase_matcher 128 | 129 | def __call__(self, doc): 130 | """ 131 | To find all token sequences matching the supplied patterns on the Doc 132 | :param doc: spacy container for linguistic annotations. 133 | :return: dataframe, containing matched occurrences. 
134 | """ 135 | df_format_json = {} 136 | phrase_matches = self._phrase_matcher(doc) 137 | for match_id, start, end in phrase_matches: 138 | if "match_id" not in df_format_json: 139 | df_format_json["match_id"] = [] 140 | df_format_json["match_id"].append(match_id) 141 | else: 142 | df_format_json["match_id"].append(match_id) 143 | if "start" not in df_format_json: 144 | df_format_json["start"] = [] 145 | df_format_json["start"].append(start) 146 | else: 147 | df_format_json["start"].append(start) 148 | if "end" not in df_format_json: 149 | df_format_json["end"] = [] 150 | df_format_json["end"].append(end) 151 | else: 152 | df_format_json["end"].append(end) 153 | if "span_text" not in df_format_json: 154 | df_format_json["span_text"] = [] 155 | df_format_json["span_text"].append(doc[start:end].text) 156 | else: 157 | df_format_json["span_text"].append(doc[start:end].text) 158 | phrase_matches_dataframe = pd.DataFrame.from_dict(df_format_json) 159 | phrase_matches_dataframe.reindex(phrase_matches_dataframe["match_id"]) 160 | phrase_matches_dataframe.drop(columns=["match_id"], inplace=True) 161 | 162 | return phrase_matches_dataframe 163 | 164 | def get_phrase_matcher(self): 165 | """ 166 | To get spaCy's phrase matcher class object (used for testing only). 167 | :return: phrase matcher object 168 | """ 169 | return PhraseMatcher( 170 | self._nlp.vocab, attr=self.attribute, validate=self.validate 171 | ) 172 | 173 | def add(self, pattern_name, patterns, callback=None): 174 | """ 175 | To add patterns to spaCy's phrase matcher object 176 | :param pattern_name: str, pattern name 177 | :param pattern: list of patterns 178 | :param callback: function, callback function to be invoked on matched occurrences. 179 | """ 180 | if not self._phrase_matcher: 181 | self._phrase_matcher = self.get_phrase_matcher() 182 | self._phrase_matcher.add(pattern_name, patterns, on_match=callback) 183 | 184 | def remove(self, pattern_name): 185 | """ 186 | To remove pattern from spaCy's matcher object 187 | :param pattern_name: str, pattern_name 188 | """ 189 | if self._phrase_matcher: 190 | self._phrase_matcher.remove(pattern_name) 191 | 192 | def reset(self, change_attribute=None): 193 | """ 194 | To re-initialize spaCy's phrase matcher object 195 | :param change_attribute: token attribute to match on 196 | """ 197 | if change_attribute: 198 | self.attribute = change_attribute 199 | self._phrase_matcher = self.get_phrase_matcher() 200 | 201 | 202 | class DframCyDependencyMatcher(object): 203 | """ 204 | Dataframe wrapper class over spaCy's DependencyMatcher 205 | https://spacy.io/api/dependencymatcher 206 | """ 207 | 208 | def __init__(self, nlp_pipeline, validate=False): 209 | """ 210 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 211 | :param validate: bool, performs validation on petterns (default: false) 212 | """ 213 | self._nlp = nlp_pipeline 214 | self._dependency_matcher = None 215 | self.validate = validate 216 | 217 | @property 218 | def nlp(self): 219 | return self._nlp 220 | 221 | @property 222 | def dependency_matcher(self): 223 | return self._dependency_matcher 224 | 225 | def __call__(self, doc): 226 | """ 227 | To find all token sequences matching the supplied patterns on the Doc 228 | :param doc: spacy container for linguistic annotations. 229 | :return: dataframe, containing matched occurrences. 
230 | """ 231 | df_format_json = {} 232 | dependency_matches = self._dependency_matcher(doc) 233 | for match_id, matched_token_indices in dependency_matches: 234 | if "match_id" not in df_format_json: 235 | df_format_json["match_id"] = [] 236 | df_format_json["match_id"].append(match_id) 237 | else: 238 | df_format_json["match_id"].append(match_id) 239 | if "token_index" not in df_format_json: 240 | df_format_json["token_index"] = [] 241 | df_format_json["token_index"].append( 242 | ", ".join([str(i) for i in matched_token_indices]) 243 | ) 244 | else: 245 | df_format_json["token_index"].append( 246 | ", ".join([str(i) for i in matched_token_indices]) 247 | ) 248 | if "token_text" not in df_format_json: 249 | df_format_json["token_text"] = [] 250 | df_format_json["token_text"].append( 251 | ", ".join([doc[i].text for i in matched_token_indices]) 252 | ) 253 | else: 254 | df_format_json["token_text"].append( 255 | ", ".join([doc[i].text for i in matched_token_indices]) 256 | ) 257 | dependency_matches_dataframe = pd.DataFrame.from_dict(df_format_json) 258 | dependency_matches_dataframe.reindex(dependency_matches_dataframe["match_id"]) 259 | dependency_matches_dataframe.drop(columns=["match_id"], inplace=True) 260 | 261 | return dependency_matches_dataframe 262 | 263 | def get_dependency_matcher(self): 264 | """ 265 | To get spaCy's dependency matcher class object (used for testing only). 266 | :return: dependency matcher object 267 | """ 268 | return DependencyMatcher(self._nlp.vocab, validate=self.validate) 269 | 270 | def add(self, pattern_name, patterns, callback=None): 271 | """ 272 | To add patterns to spaCy's dependency matcher object 273 | :param pattern_name: str, pattern name 274 | :param pattern: list of patterns 275 | :param callback: function, callback function to be invoked on matched occurrences. 276 | """ 277 | if not self._dependency_matcher: 278 | self._dependency_matcher = self.get_dependency_matcher() 279 | self._dependency_matcher.add(pattern_name, patterns, on_match=callback) 280 | 281 | def remove(self, pattern_name): 282 | """ 283 | To remove pattern from spaCy's matcher object 284 | :param pattern_name: str, pattern_name 285 | """ 286 | if self._dependency_matcher: 287 | self._dependency_matcher.remove(pattern_name) 288 | 289 | def reset(self): 290 | """ 291 | To re-initialize spaCy's dependency matcher object 292 | :param change_attribute: token attribute to match on 293 | """ 294 | self._dependency_matcher = self.get_dependency_matcher() 295 | --------------------------------------------------------------------------------