├── tests │   ├── __init__.py │   ├── test_matcher.py │   └── test_dframcy.py ├── requirements.txt ├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── dframcy │   ├── __init__.py │   ├── cli.py │   ├── utils.py │   ├── dframcy.py │   └── matcher.py ├── tox.ini ├── .coveragerc ├── LICENSE ├── setup.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | wasabi 3 | pytest 4 | pytest-cov 5 | tox 6 | tox-travis 7 | pandas 8 | xlrd 9 | spacy>=3.0.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea/ 3 | .coverage 4 | dframcy.egg-info/ 5 | build/ 6 | dist/ 7 | htmlcov/ 8 | .tox/ 9 | coverage.xml 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: bionic 3 | python: 4 | - "3.6" 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install . 8 | script: 9 | - tox -vv -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | include setup.cfg 5 | include data/* 6 | 7 | global-exclude __pycache__ 8 | global-exclude *.py[co] 9 | global-exclude .* -------------------------------------------------------------------------------- /dframcy/__init__.py: -------------------------------------------------------------------------------- 1 | from dframcy.dframcy import DframCy 2 | from dframcy.matcher import DframCyMatcher, DframCyPhraseMatcher, DframCyDependencyMatcher 3 | 4 | __version__ = "0.1.6" 5 | __all__ = ["DframCy", "DframCyMatcher", "DframCyPhraseMatcher", "DframCyDependencyMatcher"] 6 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{36} 3 | skip_missing_interpreters = True 4 | [testenv] 5 | passenv = 6 | CI 7 | TRAVIS 8 | TRAVIS_* 9 | deps = 10 | codecov 11 | commands = 12 | python3 -m spacy download en_core_web_sm 13 | py.test --cov-report=xml --cov=dframcy tests/ 14 | codecov -e TOXENV -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | dframcy 4 | tests 5 | branch = False 6 | omit = setup.py 7 | dframcy/cli.py 8 | 9 | [report] 10 | show_missing = True 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | RuntimeError 15 | NotImplementedError 16 | FileNotFoundError 17 | ImportError 18 | KeyError 19 | IOError 20 | if __name__ == .__main__.: 21 | messenger -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2019 YASH 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from io import open 3 | from setuptools import setup, find_packages 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 8 | long_description = f.read() 9 | 10 | with open(path.join(here, "requirements.txt"), encoding="utf-8") as f: 11 | REQUIRES = [line.strip("\n") for line in f] 12 | 13 | setup( 14 | name="dframcy", 15 | version="0.1.6", 16 | description="Pandas Dataframe integration for spaCy", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/yash1994/dframcy", 20 | author="Yash Patadia", 21 | author_email="yash@patadia.org", 22 | classifiers=[ 23 | "Development Status :: 3 - Alpha", 24 | "Intended Audience :: Developers", 25 | "Topic :: Software Development :: Build Tools", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3.6", 28 | ], 29 | keywords=["spacy", "dataframe", "pandas"], 30 | packages=find_packages(), 31 | install_requires=REQUIRES, 32 | tests_require=["pytest"], 33 | entry_points={ 34 | "console_scripts": ["dframcy=dframcy.cli:main"], 35 | }, 36 | ) 37 | -------------------------------------------------------------------------------- /dframcy/cli.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import json 5 | import click 6 | import spacy 7 | from io import open 8 | from pathlib import Path 9 | from wasabi import Printer 10 | from .dframcy import DframCy 11 | from .utils import get_default_columns 12 | 13 | messenger = Printer() 14 | DEFAULT_COLUMNS = ",".join(get_default_columns()) 15 | 16 | 17 | @click.group() 18 | def main(): 19 | pass 20 | 21 | 22 | @main.command() 23 | @click.option( 24 | "--input_file", "-i", required=True, type=Path, help="Input text file path." 
25 | ) 26 | @click.option("--output_file", "-o", required=True, type=Path, help="Output file path.") 27 | @click.option( 28 | "--format", 29 | "-f", 30 | default="csv", 31 | show_default=True, 32 | type=click.Choice(["json", "csv"], case_sensitive=False), 33 | help="Output file format (json/csv)", 34 | ) 35 | @click.option( 36 | "--language_model", 37 | "-l", 38 | default="en_core_web_sm", 39 | show_default=True, 40 | type=str, 41 | help="Language model to be used.", 42 | ) 43 | @click.option( 44 | "--columns", 45 | "-c", 46 | default=DEFAULT_COLUMNS, 47 | show_default=True, 48 | type=str, 49 | help="Annotations to be included in dataframe.", 50 | ) 51 | @click.option( 52 | "--separate_entity_frame", 53 | "-s", 54 | default=False, 55 | show_default=True, 56 | type=bool, 57 | help="Save separate entity dataframe.", 58 | ) 59 | def dframe( 60 | input_file, output_file, format, language_model, columns, separate_entity_frame 61 | ): 62 | format = format.lower() 63 | 64 | if output_file.is_dir(): 65 | output_file = output_file.joinpath(input_file.stem + "." + str(format)) 66 | if input_file.exists(): 67 | with open(input_file, "r") as infile: 68 | text = infile.read().strip("\n").strip() 69 | 70 | nlp = spacy.load(language_model) 71 | dframcy = DframCy(nlp) 72 | doc = nlp(text) 73 | 74 | annotation_dataframe = dframcy.to_dataframe( 75 | doc=doc, 76 | columns=columns.split(","), 77 | separate_entity_dframe=separate_entity_frame, 78 | ) 79 | 80 | if separate_entity_frame: 81 | token_annotation_dataframe, entity_dataframe = annotation_dataframe 82 | else: 83 | token_annotation_dataframe = annotation_dataframe 84 | entity_dataframe = None 85 | 86 | if format == "csv": 87 | token_annotation_dataframe.to_csv(output_file) 88 | if separate_entity_frame: 89 | # build the companion file name with pathlib; str.strip(".csv") strips characters, not the suffix 90 | entity_output_file = output_file.with_name( 91 | output_file.stem + "_entity.csv" 92 | ) 93 | entity_dataframe.to_csv(entity_output_file) 94 | elif format == "json": 95 | annotation_json = json.loads(token_annotation_dataframe.to_json(orient="columns")) 96 | json.dump(annotation_json, open(output_file, "w")) 97 | if separate_entity_frame: 98 | entity_output_file = output_file.with_name( 99 | output_file.stem + "_entity.json" 100 | ) 101 | entity_json = json.loads(entity_dataframe.to_json(orient="columns")) 102 | json.dump(entity_json, open(entity_output_file, "w")) 103 | else: 104 | messenger.fail( 105 | "input file path: {} does not exist".format(input_file), exits=-1 106 | ) -------------------------------------------------------------------------------- /dframcy/utils.py: -------------------------------------------------------------------------------- 1 | from wasabi import Printer 2 | 3 | messenger = Printer() 4 | 5 | 6 | def get_default_columns(): 7 | """ 8 | Default columns for dataframe 9 | :return: list of default attributes 10 | """ 11 | return ["id", "text", "start", "end", "pos_", "tag_", "dep_", "head", "ent_type_"] 12 | 13 | 14 | def get_spacy_token_class_config(): 15 | """ 16 | Configuration of spaCy's Token class attributes 17 | :return: config dictionary of attributes/properties 18 | """ 19 | token_config = { 20 | "PROPERTIES": [ 21 | "lefts", 22 | "rights", 23 | "n_lefts", 24 | "subtree", 25 | "children", 26 | "n_rights", 27 | "ancestors", 28 | "conjuncts", 29 | "has_vector", 30 | "is_sent_start", 31 | ], 32 | "ATTRIBUTES": [ 33 | "text", 34 | "head", 35 | "pos_", 36 | "tag_", 37 | "dep_", 38 | "orth_", 39 | "norm_", 40 | "lang_", 41 | "lemma_", 42 | "lower_", 43 | "shape_", 44 | "is_oov", 45 | "ent_id_", 46 | "prefix_", 47 | "suffix_", 48 | "is_stop", 49 | "ent_iob_", 50 | "is_alpha", 51 | "is_ascii",
"is_digit", 53 | "is_lower", 54 | "is_upper", 55 | "is_title", 56 | "is_punct", 57 | "is_space", 58 | "is_quote", 59 | "like_url", 60 | "like_num", 61 | "left_edge", 62 | "ent_type_", 63 | "right_edge", 64 | "ent_kb_id_", 65 | "is_bracket", 66 | "like_email", 67 | "is_currency", 68 | "is_left_punct", 69 | "is_right_punct", 70 | ], 71 | "ADDITIONAL_ATTRIBUTES": ["id", "start", "end"], 72 | "INT_FORMAT_ATTRIBUTES": [ 73 | "pos", 74 | "tag", 75 | "dep", 76 | "orth", 77 | "norm", 78 | "lang", 79 | "lower", 80 | "shape", 81 | "ent_id", 82 | "prefix", 83 | "suffix", 84 | "ent_iob", 85 | "ent_type", 86 | ], 87 | } 88 | 89 | return token_config 90 | 91 | 92 | def check_columns_consistency(columns): 93 | """ 94 | Checks consistency of column names passed by users 95 | with spacy's Token class. 96 | :param columns: list of column names 97 | :return: list of consistent column names 98 | """ 99 | spacy_token_config = get_spacy_token_class_config() 100 | consistent_column_names = [] 101 | for column_name in columns: 102 | if column_name in spacy_token_config["PROPERTIES"]: 103 | consistent_column_names.append((column_name, "property")) 104 | elif column_name in spacy_token_config["ATTRIBUTES"]: 105 | consistent_column_names.append((column_name, "attribute")) 106 | elif column_name in spacy_token_config["ADDITIONAL_ATTRIBUTES"]: 107 | consistent_column_names.append((column_name, "additional_attribute")) 108 | elif column_name in spacy_token_config["INT_FORMAT_ATTRIBUTES"]: 109 | consistent_column_names.append((column_name, "int_format_attribute")) 110 | else: 111 | messenger.warn( 112 | "Column name '{}' not consistent with spacy's Token class".format( 113 | column_name 114 | ) 115 | ) 116 | 117 | return consistent_column_names 118 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DframCy 2 | 3 | [![Package Version](https://img.shields.io/pypi/v/dframcy.svg?&service=github)](https://pypi.python.org/pypi/dframcy/) 4 | [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | [![Build Status](https://travis-ci.org/yash1994/dframcy.svg?branch=master)](https://travis-ci.org/yash1994/dframcy) 6 | [![codecov](https://codecov.io/gh/yash1994/dframcy/branch/master/graph/badge.svg)](https://codecov.io/gh/yash1994/dframcy) 7 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 8 | 9 | DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks. DframCy provides clean APIs to convert spaCy's linguistic annotations, Matcher and PhraseMatcher information to Pandas dataframe, also supports training and evaluation of NLP pipeline from CSV/XLXS/XLS without any changes to spaCy's underlying APIs. 10 | 11 | ## Getting Started 12 | 13 | DframCy can be easily installed. 
85 | ### Rule-Based Matching 86 | 87 | ```python 88 | # Token-based Matching 89 | import spacy 90 | 91 | nlp = spacy.load("en_core_web_sm") 92 | 93 | from dframcy.matcher import DframCyMatcher, DframCyPhraseMatcher, DframCyDependencyMatcher 94 | dframcy_matcher = DframCyMatcher(nlp) 95 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 96 | dframcy_matcher.add("HelloWorld", [pattern]) 97 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 98 | matches_dataframe = dframcy_matcher(doc) 99 | 100 | # Phrase Matching 101 | dframcy_phrase_matcher = DframCyPhraseMatcher(nlp) 102 | terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."] 103 | patterns = [dframcy_phrase_matcher.nlp.make_doc(text) for text in terms] 104 | dframcy_phrase_matcher.add("TerminologyList", patterns) 105 | doc = dframcy_phrase_matcher.nlp(u"German Chancellor Angela Merkel and US President Barack Obama " 106 | u"converse in the Oval Office inside the White House in Washington, D.C.") 107 | phrase_matches_dataframe = dframcy_phrase_matcher(doc) 108 | 109 | # Dependency Matching 110 | dframcy_dependency_matcher = DframCyDependencyMatcher(nlp) 111 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 112 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 113 | doc = dframcy_dependency_matcher.nlp(u"Bill Gates founded Microsoft. And Elon Musk founded SpaceX") 114 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 115 | ``` 116 |
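117 | ### Entity Ruler 118 | 119 | DframCy also exposes spaCy's [EntityRuler](https://spacy.io/api/entityruler) through `add_entity_ruler`, which adds an `entity_ruler` component to the pipeline; combined with `separate_entity_dframe=True`, the recognized entities land in their own dataframe. A minimal sketch, reusing the `dframcy` object from the example above and mirroring the project's test suite: 120 | 121 | ```python 122 | patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] 123 | dframcy.add_entity_ruler(patterns) 124 | doc = dframcy.nlp(u"MyCorp Inc. is a company in the U.S.") 125 | token_dataframe, entity_dataframe = dframcy.to_dataframe(doc, separate_entity_dframe=True) 126 | ``` 127 |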
And Elon Musk founded SpaceX") 107 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 108 | ``` 109 | 110 | ### Command Line Interface 111 | 112 | Dframcy supports command-line arguments for the conversion of a plain text file to linguistically annotated text in CSV/JSON format. 113 | Previous versions of Dframcy were used to support CLI utilities for training and evaluation of spaCy models from CSV/XLS files. 114 | After the [v3](https://spacy.io/usage/v3) release, spaCy's training pipeline has become much more flexible and robust so didn't want to introduce additional 115 | step using Dframcy for just format conversion (CSV/XLS to [spaCy’s binary format](https://spacy.io/api/data-formats#binary-training)). 116 | 117 | ```bash 118 | # convert 119 | dframcy dframe -i plain_text.txt -o annotations.csv -f csv 120 | ``` 121 | -------------------------------------------------------------------------------- /tests/test_matcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import spacy 5 | import pandas as pd 6 | from pandas.testing import assert_frame_equal 7 | from dframcy.matcher import ( 8 | DframCyMatcher, 9 | DframCyPhraseMatcher, 10 | DframCyDependencyMatcher, 11 | ) 12 | 13 | 14 | dframcy_matcher = DframCyMatcher(spacy.load("en_core_web_sm")) 15 | dframcy_phrase_matcher = DframCyPhraseMatcher( 16 | spacy.load("en_core_web_sm"), attr="LOWER" 17 | ) 18 | dframcy_dependency_matcher = DframCyDependencyMatcher(spacy.load("en_core_web_sm")) 19 | 20 | 21 | def test_matcher(): 22 | dframcy_matcher.reset() 23 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 24 | dframcy_matcher.add("HelloWorld", [pattern]) 25 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 26 | spacy_matcher = dframcy_matcher.matcher 27 | matches = spacy_matcher(doc) 28 | assert matches[0][0] == 15578876784678163569 29 | assert matches[0][1] == 0 30 | assert matches[0][2] == 3 31 | assert dframcy_matcher.nlp.vocab.strings[matches[0][0]] == "HelloWorld" 32 | assert doc[matches[0][1] : matches[0][2]].text == "Hello, world" 33 | 34 | dframcy_matcher.remove("HelloWorld") 35 | assert "HelloWorld" not in dframcy_matcher.matcher 36 | 37 | 38 | def test_matcher_dataframe(): 39 | dframcy_matcher.reset() 40 | pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] 41 | dframcy_matcher.add("HelloWorld", [pattern]) 42 | doc = dframcy_matcher.nlp("Hello, world! Hello world!") 43 | matches_dataframe = dframcy_matcher(doc) 44 | results = pd.DataFrame( 45 | { 46 | "start": [0], 47 | "end": [3], 48 | "string_id": ["HelloWorld"], 49 | "span_text": ["Hello, world"], 50 | } 51 | ) 52 | assert_frame_equal(matches_dataframe, results) 53 | 54 | 55 | def test_matcher_dataframe_multiple_patterns(): 56 | dframcy_matcher.reset() 57 | dframcy_matcher.add( 58 | "Hello_World", 59 | [ 60 | [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], 61 | [{"LOWER": "hello"}, {"LOWER": "world"}], 62 | ], 63 | None, 64 | ) 65 | doc = dframcy_matcher.nlp("Hello, world! 
Hello world!") 66 | matches_dataframe = dframcy_matcher(doc) 67 | results = pd.DataFrame( 68 | { 69 | "start": [0, 4], 70 | "end": [3, 6], 71 | "string_id": ["Hello_World", "Hello_World"], 72 | "span_text": ["Hello, world", "Hello world"], 73 | } 74 | ) 75 | assert_frame_equal(matches_dataframe, results) 76 | 77 | 78 | def test_phrase_matcher(): 79 | patterns = [ 80 | dframcy_phrase_matcher.nlp.make_doc(name) 81 | for name in ["Angela Merkel", "Barack Obama"] 82 | ] 83 | dframcy_phrase_matcher.add("Names", patterns) 84 | doc = dframcy_phrase_matcher.nlp("angela merkel and us president barack Obama") 85 | spacy_phrase_matcher = dframcy_phrase_matcher.phrase_matcher 86 | matches = spacy_phrase_matcher(doc) 87 | assert matches[0][0] == 10631222085860127603 88 | assert matches[0][1] == 0 89 | assert matches[0][2] == 2 90 | assert doc[matches[0][1] : matches[0][2]].text == "angela merkel" 91 | assert doc[matches[1][1] : matches[1][2]].text == "barack Obama" 92 | 93 | dframcy_phrase_matcher.remove("Names") 94 | assert "Names" not in dframcy_phrase_matcher.phrase_matcher 95 | 96 | 97 | def test_phrase_matcher_dataframe(): 98 | dframcy_phrase_matcher.reset() 99 | terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] 100 | patterns = [dframcy_phrase_matcher.nlp.make_doc(text) for text in terms] 101 | dframcy_phrase_matcher.add("TerminologyList", patterns) 102 | doc = dframcy_phrase_matcher.nlp( 103 | "German Chancellor Angela Merkel and US President Barack Obama " 104 | "converse in the Oval Office inside the White House in Washington, D.C." 105 | ) 106 | phrase_matches_dataframe = dframcy_phrase_matcher(doc) 107 | results = pd.DataFrame( 108 | { 109 | "start": [2, 7, 19], 110 | "end": [4, 9, 22], 111 | "span_text": ["Angela Merkel", "Barack Obama", "Washington, D.C."], 112 | } 113 | ) 114 | assert_frame_equal(phrase_matches_dataframe, results) 115 | 116 | 117 | def test_dependency_matcher(): 118 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 119 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 120 | doc = dframcy_dependency_matcher.nlp("Bill Gates founded Microsoft.") 121 | spacy_dependency_matcher = dframcy_dependency_matcher.dependency_matcher 122 | matches = spacy_dependency_matcher(doc) 123 | assert matches[0][0] == 4851363122962674176 124 | assert len(matches[0][1]) == 1 125 | assert matches[0][1][0] == 2 126 | assert "FOUNDED" in dframcy_dependency_matcher.dependency_matcher 127 | dframcy_dependency_matcher.remove("FOUNDED") 128 | assert "FOUNDED" not in dframcy_dependency_matcher.dependency_matcher 129 | 130 | 131 | def test_dependency_matcher_dataframe(): 132 | dframcy_dependency_matcher.reset() 133 | pattern = [{"RIGHT_ID": "founded_id", "RIGHT_ATTRS": {"ORTH": "founded"}}] 134 | dframcy_dependency_matcher.add("FOUNDED", [pattern]) 135 | doc = dframcy_dependency_matcher.nlp( 136 | "Bill Gates founded Microsoft. 
And Elon Musk founded SpaceX" 137 | ) 138 | dependency_matches_dataframe = dframcy_dependency_matcher(doc) 139 | results = pd.DataFrame( 140 | {"token_index": ["2", "8"], "token_text": ["founded", "founded"]} 141 | ) 142 | assert_frame_equal(dependency_matches_dataframe, results) 143 | -------------------------------------------------------------------------------- /tests/test_dframcy.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import os 5 | import json 6 | import spacy 7 | import pytest 8 | import pandas as pd 9 | from dframcy.dframcy import DframCy 10 | from pandas.testing import assert_frame_equal 11 | 12 | dframcy = DframCy(spacy.load("en_core_web_sm")) 13 | 14 | current_dir = os.path.dirname(os.path.realpath(__file__)) 15 | project_root = "/" + "/".join(current_dir.split("/")[1:-1]) 16 | data_dir = project_root + "/data" 17 | 18 | 19 | @pytest.mark.parametrize("text", ["I am here in USA."]) 20 | def test_nlp_pipeline(text): 21 | doc = dframcy.nlp(text) 22 | assert doc[0].text == "I" 23 | assert doc[0].tag_ == "PRP" 24 | assert doc[1].lemma_ == "be" 25 | 26 | 27 | @pytest.mark.parametrize("text", ["I am here in USA."]) 28 | def test_default_columns(text): 29 | doc = dframcy.nlp(text) 30 | dataframe = dframcy.to_dataframe(doc) 31 | results = pd.DataFrame( 32 | { 33 | "token_text": ["I", "am", "here", "in", "USA", "."], 34 | "token_start": [0, 2, 5, 10, 13, 16], 35 | "token_end": [1, 4, 9, 12, 16, 17], 36 | "token_pos_": ["PRON", "AUX", "ADV", "ADP", "PROPN", "PUNCT"], 37 | "token_tag_": ["PRP", "VBP", "RB", "IN", "NNP", "."], 38 | "token_dep_": ["nsubj", "ROOT", "advmod", "prep", "pobj", "punct"], 39 | "token_head": ["am", "am", "am", "here", "in", "am"], 40 | "token_ent_type_": ["", "", "", "", "GPE", ""], 41 | } 42 | ) 43 | assert_frame_equal(dataframe, results) 44 | 45 | 46 | @pytest.mark.parametrize("text", ["bright red apples on the tree"]) 47 | def test_unknown_column_value(text): 48 | doc = dframcy.nlp(text) 49 | dataframe = dframcy.to_dataframe(doc, columns=["id", "start", "end", "apple"]) 50 | results = pd.DataFrame( 51 | {"token_start": [0, 7, 11, 18, 21, 25], "token_end": [6, 10, 17, 20, 24, 29]} 52 | ) 53 | assert_frame_equal(dataframe, results) 54 | 55 | 56 | @pytest.mark.parametrize("text", ["I have an apple"]) 57 | def test_custom_attribute(text): 58 | from spacy.tokens import Token 59 | 60 | fruit_getter = lambda token: token.text in ("apple", "pear", "banana") 61 | Token.set_extension("is_fruit", getter=fruit_getter) 62 | doc = dframcy.nlp(text) 63 | dataframe = dframcy.to_dataframe( 64 | doc, columns=["id", "start", "end"], custom_attributes=["is_fruit"] 65 | ) 66 | results = pd.DataFrame( 67 | { 68 | "token_start": [0, 2, 7, 10], 69 | "token_end": [1, 6, 9, 15], 70 | "token_is_fruit": [False, False, False, True], 71 | } 72 | ) 73 | assert_frame_equal(dataframe, results) 74 | 75 | 76 | def test_all_columns_thoroughly(): 77 | doc = dframcy.nlp( 78 | "Machine learning is an application of artificial intelligence (AI) that provides systems the " 79 | "ability to automatically learn and improve from experience without being explicitly " 80 | "programmed. Machine learning focuses on the development of computer programs that can access " 81 | "data and use it learn for themselves." 
82 | ) 83 | dataframe = dframcy.to_dataframe( 84 | doc, 85 | [ 86 | "id", 87 | "end", 88 | "pos", 89 | "tag", 90 | "dep", 91 | "text", 92 | "head", 93 | "pos_", 94 | "tag_", 95 | "dep_", 96 | "orth", 97 | "norm", 98 | "lang", 99 | "orth_", 100 | "norm_", 101 | "lang_", 102 | "lefts", 103 | "start", 104 | "lower", 105 | "shape", 106 | "lemma_", 107 | "lower_", 108 | "shape_", 109 | "is_oov", 110 | "rights", 111 | "ent_id", 112 | "prefix", 113 | "suffix", 114 | "ent_id_", 115 | "prefix_", 116 | "suffix_", 117 | "is_stop", 118 | "n_lefts", 119 | "subtree", 120 | "ent_iob", 121 | "ent_iob_", 122 | "is_alpha", 123 | "is_ascii", 124 | "is_digit", 125 | "is_lower", 126 | "is_upper", 127 | "is_title", 128 | "is_punct", 129 | "is_space", 130 | "is_quote", 131 | "like_url", 132 | "like_num", 133 | "children", 134 | "n_rights", 135 | "ent_type", 136 | "left_edge", 137 | "ent_type_", 138 | "ancestors", 139 | "conjuncts", 140 | "right_edge", 141 | "ent_kb_id_", 142 | "is_bracket", 143 | "like_email", 144 | "has_vector", 145 | "is_currency", 146 | "is_left_punct", 147 | "is_sent_start", 148 | "is_right_punct", 149 | ], 150 | ) 151 | 152 | assert dataframe.shape == (48, 62) 153 | assert dataframe["token_ancestors"][0] == "learning, is" 154 | assert (dataframe.token_is_lower).sum() == 41 155 | assert (~dataframe.token_is_lower).sum() == 7 156 | 157 | 158 | def test_entity_rule_dataframe(): 159 | dframcy_test_ent = DframCy(spacy.load("en_core_web_sm")) 160 | patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] 161 | dframcy_test_ent.add_entity_ruler(patterns) 162 | doc = dframcy_test_ent.nlp("MyCorp Inc. is a company in the U.S.") 163 | _, entity_frame = dframcy_test_ent.to_dataframe(doc, separate_entity_dframe=True) 164 | results = pd.DataFrame( 165 | {"ent_text": ["MyCorp Inc.", "U.S."], "ent_label": ["ORG", "GPE"]} 166 | ) 167 | assert_frame_equal(entity_frame, results) 168 | 169 | 170 | def test_sentence_without_named_entities(): 171 | doc = dframcy.nlp("Autonomous cars shift insurance liability toward manufacturers.") 172 | dataframe = dframcy.to_dataframe(doc, ["pos_", "tag_", "ent_type_"]) 173 | 174 | assert "token_ent_type_" not in dataframe.columns 175 | -------------------------------------------------------------------------------- /dframcy/dframcy.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import pandas as pd 5 | 6 | from dframcy import utils 7 | 8 | 9 | class DframCy(object): 10 | """ 11 | Dataframe integration with spaCy's linguistic annotations. 12 | """ 13 | 14 | def __init__(self, nlp_pipeline): 15 | """ 16 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 
17 | """ 18 | self._nlp = nlp_pipeline 19 | 20 | @property 21 | def nlp(self): 22 | """ 23 | To get texted nlped 24 | :return: Spacy's Doc object 25 | """ 26 | return self._nlp 27 | 28 | @staticmethod 29 | def get_token_attribute_value(token, attribute_name, _type): 30 | """ 31 | To get value of specific attribute of spacy's Token class 32 | :param token: token object of class Token 33 | :param attribute_name: name attribute for which value is required 34 | :param _type: type of class attribute (property, attribute) 35 | :retrun: attribute value 36 | """ 37 | if _type == "attribute" or _type == "int_format_attribute": 38 | value = getattr(token, attribute_name) 39 | if attribute_name in ["head", "left_edge", "right_edge"]: 40 | return value.text 41 | else: 42 | return value 43 | elif _type == "property": 44 | value = getattr(token, attribute_name) 45 | if attribute_name in ["n_lefts", "n_rights", "has_vector", "is_sent_start"]: 46 | return value 47 | else: 48 | return ", ".join([v.text for v in value]) 49 | elif _type == "additional_attribute": 50 | if attribute_name == "id": 51 | return getattr(token, "i") 52 | elif attribute_name == "start": 53 | return getattr(token, "idx") 54 | elif attribute_name == "end": 55 | return getattr(token, "idx") + len(token) 56 | elif _type == "custom_attributes": 57 | return getattr(getattr(token, "_"), attribute_name) 58 | 59 | def get_token_attribute_dict(self, doc, consistent_columns): 60 | """ 61 | To get attribute dictionary for sequence of Token object in Doc 62 | :param doc: Doc object 63 | :param consistent_columns: name attributes required with its type 64 | :return: python dictionary containing attributes names as keys 65 | and list of all token values as value. 66 | """ 67 | token_attribute_dictionary = {} 68 | for token in doc: 69 | for column_name in consistent_columns: 70 | if column_name[0] in token_attribute_dictionary: 71 | token_attribute_dictionary[column_name[0]].append( 72 | self.get_token_attribute_value( 73 | token, column_name[0], column_name[1] 74 | ) 75 | ) 76 | else: 77 | token_attribute_dictionary[column_name[0]] = [] 78 | token_attribute_dictionary[column_name[0]].append( 79 | self.get_token_attribute_value( 80 | token, column_name[0], column_name[1] 81 | ) 82 | ) 83 | return token_attribute_dictionary 84 | 85 | @staticmethod 86 | def get_named_entity_dict(doc): 87 | """ 88 | To get named entities from NLP processed text 89 | :param doc: spacy container for linguistic annotations. 90 | :return: dictionary containing entity_text and entity_label 91 | """ 92 | entity_details_dict = {"ent_text": [], "ent_label": []} 93 | for ent in doc.ents: 94 | entity_details_dict["ent_text"].append(ent.text) 95 | entity_details_dict["ent_label"].append(ent.label_) 96 | return entity_details_dict 97 | 98 | def to_dataframe( 99 | self, doc, columns=None, separate_entity_dframe=False, custom_attributes=None 100 | ): 101 | """ 102 | Convert Linguistic annotations for text into pandas dataframe 103 | :param doc: spacy container for linguistic annotations. 
104 | :param columns: list of str, name of columns to be included in dataframe (default: 105 | ["id", "text", "start", "end", "pos_", "tag_", "dep_", "head", "ent_type_"]) 106 | :param separate_entity_dframe: bool, for separate entity dataframe (default: False) 107 | :param custom_attributes: list, for custom attribute 108 | :return: dataframe, dataframe containing linguistic annotations 109 | """ 110 | if columns is None: 111 | columns = utils.get_default_columns() 112 | 113 | if "id" not in columns: 114 | columns = ["id"] + columns 115 | 116 | consistent_columns = utils.check_columns_consistency(columns) 117 | 118 | if custom_attributes: 119 | consistent_columns += [ 120 | (attr, "custom_attributes") for attr in custom_attributes 121 | ] 122 | 123 | token_attribute_dictionary = self.get_token_attribute_dict( 124 | doc, consistent_columns 125 | ) 126 | tokens_dataframe = pd.DataFrame.from_dict(token_attribute_dictionary) 127 | 128 | new_column_names_map = {i: "token_" + i for i in tokens_dataframe.columns} 129 | 130 | tokens_dataframe.rename(columns=new_column_names_map, inplace=True) 131 | 132 | tokens_dataframe.reindex(tokens_dataframe["token_id"]) 133 | 134 | tokens_dataframe.drop(columns=["token_id"], inplace=True) 135 | 136 | if not doc.ents and "token_ent_type_" in tokens_dataframe.columns: 137 | tokens_dataframe.drop(columns=["token_ent_type_"], inplace=True) 138 | 139 | if separate_entity_dframe: 140 | entity_dict = self.get_named_entity_dict(doc) 141 | entity_dataframe = pd.DataFrame.from_dict(entity_dict) 142 | 143 | return ( 144 | tokens_dataframe 145 | if not separate_entity_dframe 146 | else (tokens_dataframe, entity_dataframe) 147 | ) 148 | 149 | def add_entity_ruler(self, patterns): 150 | """ 151 | To add entity ruler in nlp pipeline 152 | official doc: https://spacy.io/api/entityruler 153 | :param patterns: list or list of lists of token/phrase based patterns 154 | """ 155 | ruler = self._nlp.add_pipe("entity_ruler") 156 | ruler.add_patterns(patterns) 157 | -------------------------------------------------------------------------------- /dframcy/matcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | 4 | import pandas as pd 5 | from spacy.matcher import Matcher 6 | from spacy.matcher import PhraseMatcher 7 | from spacy.matcher import DependencyMatcher 8 | 9 | 10 | class DframCyMatcher(object): 11 | """ 12 | Dataframe wrapper class over spaCy's Matcher 13 | https://spacy.io/api/matcher 14 | """ 15 | 16 | def __init__(self, nlp_pipeline, validate=False): 17 | """ 18 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 19 | """ 20 | self._nlp = nlp_pipeline 21 | self._matcher = None 22 | self.validate = validate 23 | 24 | @property 25 | def nlp(self): 26 | return self._nlp 27 | 28 | @property 29 | def matcher(self): 30 | return self._matcher 31 | 32 | def __call__(self, doc): 33 | """ 34 | To find all token sequences matching the supplied patterns on the Doc 35 | :param doc: spacy container for linguistic annotations. 36 | :return: dataframe, containing matched occurrences. 
37 | """ 38 | df_format_json = {} 39 | matches = self._matcher(doc) 40 | for match_id, start, end in matches: 41 | if "match_id" not in df_format_json: 42 | df_format_json["match_id"] = [] 43 | df_format_json["match_id"].append(match_id) 44 | else: 45 | df_format_json["match_id"].append(match_id) 46 | if "start" not in df_format_json: 47 | df_format_json["start"] = [] 48 | df_format_json["start"].append(start) 49 | else: 50 | df_format_json["start"].append(start) 51 | if "end" not in df_format_json: 52 | df_format_json["end"] = [] 53 | df_format_json["end"].append(end) 54 | else: 55 | df_format_json["end"].append(end) 56 | if "string_id" not in df_format_json: 57 | df_format_json["string_id"] = [] 58 | df_format_json["string_id"].append(self._nlp.vocab.strings[match_id]) 59 | else: 60 | df_format_json["string_id"].append(self._nlp.vocab.strings[match_id]) 61 | if "span_text" not in df_format_json: 62 | df_format_json["span_text"] = [] 63 | df_format_json["span_text"].append(doc[start:end].text) 64 | else: 65 | df_format_json["span_text"].append(doc[start:end].text) 66 | matches_dataframe = pd.DataFrame.from_dict(df_format_json) 67 | matches_dataframe.reindex(matches_dataframe["match_id"]) 68 | matches_dataframe.drop(columns=["match_id"], inplace=True) 69 | 70 | return matches_dataframe 71 | 72 | def get_matcher(self): 73 | """ 74 | To initialize spaCy's matcher class object. 75 | :return: Matcher object 76 | """ 77 | return Matcher(self._nlp.vocab, validate=self.validate) 78 | 79 | def add(self, pattern_name, patterns, callback=None): 80 | """ 81 | To add patterns to spaCy's matcher object 82 | :param pattern_name: str, pattern name 83 | :param patterns: list of patterns 84 | :param callback: function, callback function to be invoked on matched occurrences. 85 | """ 86 | if not self._matcher: 87 | self._matcher = self.get_matcher() 88 | self._matcher.add(pattern_name, patterns, on_match=callback) 89 | 90 | def remove(self, pattern_name): 91 | """ 92 | To remove pattern from spaCy's matcher object 93 | :param pattern_name: str, pattern_name 94 | """ 95 | if self._matcher: 96 | self._matcher.remove(pattern_name) 97 | 98 | def reset(self): 99 | """ 100 | To re-initialize spaCy's matcher object 101 | """ 102 | self._matcher = self.get_matcher() 103 | 104 | 105 | class DframCyPhraseMatcher(object): 106 | """ 107 | Dataframe wrapper class over spaCy's PhraseMatcher 108 | https://spacy.io/api/phrasematcher 109 | """ 110 | 111 | def __init__(self, nlp_pipeline, attr="ORTH", validate=False): 112 | """ 113 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 114 | :param attr: str, token attribute to match on (default: "ORTH") 115 | """ 116 | self._nlp = nlp_pipeline 117 | self._phrase_matcher = None 118 | self.attribute = attr 119 | self.validate = validate 120 | 121 | @property 122 | def nlp(self): 123 | return self._nlp 124 | 125 | @property 126 | def phrase_matcher(self): 127 | return self._phrase_matcher 128 | 129 | def __call__(self, doc): 130 | """ 131 | To find all token sequences matching the supplied patterns on the Doc 132 | :param doc: spacy container for linguistic annotations. 133 | :return: dataframe, containing matched occurrences. 
134 | """ 135 | df_format_json = {} 136 | phrase_matches = self._phrase_matcher(doc) 137 | for match_id, start, end in phrase_matches: 138 | if "match_id" not in df_format_json: 139 | df_format_json["match_id"] = [] 140 | df_format_json["match_id"].append(match_id) 141 | else: 142 | df_format_json["match_id"].append(match_id) 143 | if "start" not in df_format_json: 144 | df_format_json["start"] = [] 145 | df_format_json["start"].append(start) 146 | else: 147 | df_format_json["start"].append(start) 148 | if "end" not in df_format_json: 149 | df_format_json["end"] = [] 150 | df_format_json["end"].append(end) 151 | else: 152 | df_format_json["end"].append(end) 153 | if "span_text" not in df_format_json: 154 | df_format_json["span_text"] = [] 155 | df_format_json["span_text"].append(doc[start:end].text) 156 | else: 157 | df_format_json["span_text"].append(doc[start:end].text) 158 | phrase_matches_dataframe = pd.DataFrame.from_dict(df_format_json) 159 | phrase_matches_dataframe.reindex(phrase_matches_dataframe["match_id"]) 160 | phrase_matches_dataframe.drop(columns=["match_id"], inplace=True) 161 | 162 | return phrase_matches_dataframe 163 | 164 | def get_phrase_matcher(self): 165 | """ 166 | To get spaCy's phrase matcher class object (used for testing only). 167 | :return: phrase matcher object 168 | """ 169 | return PhraseMatcher( 170 | self._nlp.vocab, attr=self.attribute, validate=self.validate 171 | ) 172 | 173 | def add(self, pattern_name, patterns, callback=None): 174 | """ 175 | To add patterns to spaCy's phrase matcher object 176 | :param pattern_name: str, pattern name 177 | :param pattern: list of patterns 178 | :param callback: function, callback function to be invoked on matched occurrences. 179 | """ 180 | if not self._phrase_matcher: 181 | self._phrase_matcher = self.get_phrase_matcher() 182 | self._phrase_matcher.add(pattern_name, patterns, on_match=callback) 183 | 184 | def remove(self, pattern_name): 185 | """ 186 | To remove pattern from spaCy's matcher object 187 | :param pattern_name: str, pattern_name 188 | """ 189 | if self._phrase_matcher: 190 | self._phrase_matcher.remove(pattern_name) 191 | 192 | def reset(self, change_attribute=None): 193 | """ 194 | To re-initialize spaCy's phrase matcher object 195 | :param change_attribute: token attribute to match on 196 | """ 197 | if change_attribute: 198 | self.attribute = change_attribute 199 | self._phrase_matcher = self.get_phrase_matcher() 200 | 201 | 202 | class DframCyDependencyMatcher(object): 203 | """ 204 | Dataframe wrapper class over spaCy's DependencyMatcher 205 | https://spacy.io/api/dependencymatcher 206 | """ 207 | 208 | def __init__(self, nlp_pipeline, validate=False): 209 | """ 210 | :param nlp_pipeline: nlp pipeline to be used (i.e. language model). 211 | :param validate: bool, performs validation on petterns (default: false) 212 | """ 213 | self._nlp = nlp_pipeline 214 | self._dependency_matcher = None 215 | self.validate = validate 216 | 217 | @property 218 | def nlp(self): 219 | return self._nlp 220 | 221 | @property 222 | def dependency_matcher(self): 223 | return self._dependency_matcher 224 | 225 | def __call__(self, doc): 226 | """ 227 | To find all token sequences matching the supplied patterns on the Doc 228 | :param doc: spacy container for linguistic annotations. 229 | :return: dataframe, containing matched occurrences. 
230 | """ 231 | df_format_json = {} 232 | dependency_matches = self._dependency_matcher(doc) 233 | for match_id, matched_token_indices in dependency_matches: 234 | if "match_id" not in df_format_json: 235 | df_format_json["match_id"] = [] 236 | df_format_json["match_id"].append(match_id) 237 | else: 238 | df_format_json["match_id"].append(match_id) 239 | if "token_index" not in df_format_json: 240 | df_format_json["token_index"] = [] 241 | df_format_json["token_index"].append( 242 | ", ".join([str(i) for i in matched_token_indices]) 243 | ) 244 | else: 245 | df_format_json["token_index"].append( 246 | ", ".join([str(i) for i in matched_token_indices]) 247 | ) 248 | if "token_text" not in df_format_json: 249 | df_format_json["token_text"] = [] 250 | df_format_json["token_text"].append( 251 | ", ".join([doc[i].text for i in matched_token_indices]) 252 | ) 253 | else: 254 | df_format_json["token_text"].append( 255 | ", ".join([doc[i].text for i in matched_token_indices]) 256 | ) 257 | dependency_matches_dataframe = pd.DataFrame.from_dict(df_format_json) 258 | dependency_matches_dataframe.reindex(dependency_matches_dataframe["match_id"]) 259 | dependency_matches_dataframe.drop(columns=["match_id"], inplace=True) 260 | 261 | return dependency_matches_dataframe 262 | 263 | def get_dependency_matcher(self): 264 | """ 265 | To get spaCy's dependency matcher class object (used for testing only). 266 | :return: dependency matcher object 267 | """ 268 | return DependencyMatcher(self._nlp.vocab, validate=self.validate) 269 | 270 | def add(self, pattern_name, patterns, callback=None): 271 | """ 272 | To add patterns to spaCy's dependency matcher object 273 | :param pattern_name: str, pattern name 274 | :param pattern: list of patterns 275 | :param callback: function, callback function to be invoked on matched occurrences. 276 | """ 277 | if not self._dependency_matcher: 278 | self._dependency_matcher = self.get_dependency_matcher() 279 | self._dependency_matcher.add(pattern_name, patterns, on_match=callback) 280 | 281 | def remove(self, pattern_name): 282 | """ 283 | To remove pattern from spaCy's matcher object 284 | :param pattern_name: str, pattern_name 285 | """ 286 | if self._dependency_matcher: 287 | self._dependency_matcher.remove(pattern_name) 288 | 289 | def reset(self): 290 | """ 291 | To re-initialize spaCy's dependency matcher object 292 | :param change_attribute: token attribute to match on 293 | """ 294 | self._dependency_matcher = self.get_dependency_matcher() 295 | --------------------------------------------------------------------------------