├── tests ├── data │ └── test.dat ├── __init__.py ├── test_client.py ├── test_protobuf.py ├── test_annotator.py └── happyfuntokenizer.py ├── CHANGELOG ├── MANIFEST.in ├── setup.cfg ├── .travis.yml ├── .github └── ISSUE_TEMPLATE │ └── use-stanza-instead.md ├── corenlp ├── __init__.py ├── main.py ├── annotator.py └── client.py ├── tox.ini ├── LICENSE ├── .gitignore ├── setup.py ├── README.rst └── doc └── CoreNLP.proto /tests/data/test.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/python-stanford-corenlp/HEAD/tests/data/test.dat -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 3.7.1 - Fixed some bugs with tests for stanford-corenlp and renamed to stanford-corenlp 2 | 3.7.0 - Initial release 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include *.rst 3 | include LICENSE 4 | include CHANGELOG 5 | 6 | # Include the data files 7 | recursive-include corenlp *.py 8 | recursive-include doc *.proto 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # the inclusion of the tests module is not meant to offer best practices for 2 | # testing in general, but rather to support the `find_packages` example in 3 | # setup.py that excludes installing the "tests" package 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=1 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.5 5 | notifications: 6 | email: false 7 | before_install: 8 | - sudo apt-get update 9 | install: 10 | - pip install tox-travis 11 | # Run test 12 | script: 13 | - tox 14 | # only integrate the master branch 15 | branches: 16 | only: 17 | - master 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/use-stanza-instead.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Use stanza instead 3 | about: Stop using this repo 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | This repo is no longer supported. Please use stanza instead. 11 | 12 | https://github.com/stanfordnlp/stanza 13 | 14 | If this new issue is not completely ignored, the only response you will get will be to use stanza instead. 
15 | -------------------------------------------------------------------------------- /corenlp/__init__.py: -------------------------------------------------------------------------------- 1 | from corenlp_protobuf import to_text 2 | from corenlp_protobuf import Document, Sentence, Token, IndexedWord, Span 3 | from corenlp_protobuf import ParseTree, DependencyGraph, CorefChain 4 | from corenlp_protobuf import Mention, NERMention, Entity, Relation, RelationTriple, Timex 5 | from corenlp_protobuf import Quote, SpeakerInfo 6 | from corenlp_protobuf import Operator, Polarity 7 | from corenlp_protobuf import SentenceFragment, TokenLocation 8 | from corenlp_protobuf import MapStringString, MapIntString 9 | from .client import CoreNLPClient, AnnotationException, TimeoutException 10 | from .annotator import Annotator 11 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # this file is *not* meant to cover or endorse the use of tox or pytest or 2 | # testing in general, 3 | # 4 | # It's meant to show the use of: 5 | # 6 | # - check-manifest 7 | # confirm items checked into vcs are in your sdist 8 | # - python setup.py check (using the readme_renderer extension) 9 | # confirms your long_description will render correctly on pypi 10 | # 11 | # and also to help confirm pull requests to this project. 12 | 13 | [tox] 14 | envlist = py{27,35,36} 15 | 16 | [testenv] 17 | basepython = 18 | py27: python2.7 19 | py35: python3.5 20 | py36: python3.6 21 | deps = 22 | check-manifest 23 | readme_renderer 24 | pytest 25 | requests 26 | protobuf 27 | commands = 28 | check-manifest --ignore tox.ini,tests* 29 | python setup.py check -m -r -s 30 | py.test tests/test_protobuf.py 31 | py.test tests/test_annotator.py -k "annotator_" 32 | [flake8] 33 | exclude = .tox,*.egg,build,data 34 | select = E,W,F 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Stanford NLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that call a running CoreNLPClient. 
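These tests launch a StanfordCoreNLPServer in the background, so the $CORENLP_HOME environment variable must point at an unzipped CoreNLP release (see the README) before they can run.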
3 | """ 4 | import corenlp 5 | 6 | TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n" 7 | 8 | def test_connect(): 9 | with corenlp.CoreNLPClient() as client: 10 | client.ensure_alive() 11 | assert client.is_active 12 | assert client.is_alive() 13 | 14 | def test_annotate(): 15 | with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client: 16 | ann = client.annotate(TEXT) 17 | assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1] 18 | 19 | def test_update(): 20 | with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client: 21 | ann = client.annotate(TEXT) 22 | ann = client.update(ann) 23 | assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1] 24 | 25 | def test_tokensregex(): 26 | with corenlp.CoreNLPClient(annotators='tokenize ssplit ner depparse'.split(), timeout=60000) as client: 27 | # Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml 28 | pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/' 29 | matches = client.tokensregex(TEXT, pattern) 30 | assert len(matches["sentences"]) == 1 31 | assert matches["sentences"][0]["length"] == 1 32 | assert matches == { 33 | "sentences": [{ 34 | "0": { 35 | "text": "Chris wrote a simple sentence", 36 | "begin": 0, 37 | "end": 5, 38 | "1": { 39 | "text": "Chris", 40 | "begin": 0, 41 | "end": 1 42 | }}, 43 | "length": 1 44 | },]} 45 | 46 | def test_semgrex(): 47 | with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma ner depparse'.split(), timeout=60000) as client: 48 | pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object' 49 | matches = client.semgrex(TEXT, pattern, to_words=True) 50 | assert matches == [ 51 | { 52 | "text": "wrote", 53 | "begin": 1, 54 | "end": 2, 55 | "$subject": { 56 | "text": "Chris", 57 | "begin": 0, 58 | "end": 1 59 | }, 60 | "$object": { 61 | "text": "sentence", 62 | "begin": 4, 63 | "end": 5 64 | }, 65 | "sentence": 0,}] 66 | -------------------------------------------------------------------------------- /corenlp/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Simple shell program to pipe in 5 | """ 6 | 7 | import corenlp 8 | 9 | import json 10 | import re 11 | import csv 12 | import sys 13 | from collections import namedtuple, OrderedDict 14 | 15 | FLOAT_RE = re.compile(r"\d*\.\d+") 16 | INT_RE = re.compile(r"\d+") 17 | 18 | def dictstr(arg): 19 | """ 20 | Parse a key=value string as a tuple (key, value) that can be provided as an argument to dict() 21 | """ 22 | key, value = arg.split("=") 23 | 24 | if value.lower() == "true" or value.lower() == "false": 25 | value = bool(value) 26 | elif INT_RE.match(value): 27 | value = int(value) 28 | elif FLOAT_RE.match(value): 29 | value = float(value) 30 | return (key, value) 31 | 32 | 33 | def do_annotate(args): 34 | args.props = dict(args.props) if args.props else {} 35 | if args.sentence_mode: 36 | args.props["ssplit.isOneSentence"] = True 37 | 38 | with corenlp.CoreNLPClient(annotators=args.annotators, properties=args.props, be_quiet=not args.verbose_server) as client: 39 | for line in args.input: 40 | if line.startswith("#"): continue 41 | 42 | ann = client.annotate(line.strip(), output_format=args.format) 43 | 44 | if args.format == "json": 45 | if args.sentence_mode: 46 | ann = ann["sentences"][0] 47 | 48 | args.output.write(json.dumps(ann)) 49 | args.output.write("\n") 50 | 51 | def main(): 52 | import argparse 53 | parser = 
argparse.ArgumentParser(description='Annotate data') 54 | parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Input file to process; each line contains one document (default: stdin)") 55 | parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="File to write annotations to (default: stdout)") 56 | parser.add_argument('-f', '--format', choices=["json",], default="json", help="Output format") 57 | parser.add_argument('-a', '--annotators', nargs="+", type=str, default=["tokenize ssplit lemma pos"], help="A list of annotators") 58 | parser.add_argument('-s', '--sentence-mode', action="store_true",help="Assume each line of input is a sentence.") 59 | parser.add_argument('-v', '--verbose-server', action="store_true",help="Server is made verbose") 60 | parser.add_argument('-m', '--memory', type=str, default="4G", help="Memory to use for the server") 61 | parser.add_argument('-p', '--props', nargs="+", type=dictstr, help="Properties as a list of key=value pairs") 62 | parser.set_defaults(func=do_annotate) 63 | 64 | ARGS = parser.parse_args() 65 | if ARGS.func is None: 66 | parser.print_help() 67 | sys.exit(1) 68 | else: 69 | ARGS.func(ARGS) 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 2 | 3 | See: 4 | https://packaging.python.org/en/latest/distributing.html 5 | https://github.com/pypa/sampleproject 6 | """ 7 | 8 | # Always prefer setuptools over distutils 9 | from setuptools import setup, find_packages 10 | # To use a consistent encoding 11 | from codecs import open 12 | from os import path 13 | 14 | here = path.abspath(path.dirname(__file__)) 15 | 16 | # Get the long description from the README file 17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | setup( 21 | name='stanford-corenlp', 22 | 23 | # Versions should comply with PEP440. For a discussion on single-sourcing 24 | # the version across setup.py and the project code, see 25 | # https://packaging.python.org/en/latest/single_source_version.html 26 | version='3.9.2', 27 | 28 | description='Official python interface for Stanford CoreNLP', 29 | long_description=long_description, 30 | 31 | # The project's main homepage. 32 | url='https://github.com/stanfordnlp/python-stanford-corenlp', 33 | 34 | # Author details 35 | author='Stanford NLP Group', 36 | author_email='chaganty@cs.stanford.edu', 37 | 38 | # Choose your license 39 | license='MIT', 40 | 41 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 42 | classifiers=[ 43 | # How mature is this project? Common values are 44 | # 3 - Alpha 45 | # 4 - Beta 46 | # 5 - Production/Stable 47 | 'Development Status :: 4 - Beta', 48 | 49 | # Indicate who your project is intended for 50 | 'Intended Audience :: Developers', 51 | 'Topic :: Software Development :: Object Brokering', 52 | 53 | # Pick your license as you wish (should match "license" above) 54 | 'License :: OSI Approved :: MIT License', 55 | 56 | # Specify the Python versions you support here. In particular, ensure 57 | # that you indicate whether you support Python 2, Python 3 or both. 
58 | 'Programming Language :: Python :: 2', 59 | 'Programming Language :: Python :: 2.7', 60 | 'Programming Language :: Python :: 3', 61 | 'Programming Language :: Python :: 3.3', 62 | 'Programming Language :: Python :: 3.4', 63 | 'Programming Language :: Python :: 3.5', 64 | ], 65 | 66 | # What does your project relate to? 67 | keywords='corenlp natural-language-processing nlp', 68 | 69 | # You can just specify the packages manually here if your project is 70 | # simple. Or you can use find_packages(). 71 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 72 | 73 | # Alternatively, if you want to distribute just a my_module.py, uncomment 74 | # this: 75 | #py_modules=["corenlp_protobuf"], 76 | 77 | # List run-time dependencies here. These will be installed by pip when 78 | # your project is installed. For an analysis of "install_requires" vs pip's 79 | # requirements files see: 80 | # https://packaging.python.org/en/latest/requirements.html 81 | install_requires=['corenlp-protobuf >= 3.8.0', 'requests >= 2.10.0', 'six >= 1.9'], 82 | 83 | # List additional groups of dependencies here (e.g. development 84 | # dependencies). You can install these using the following syntax, 85 | # for example: 86 | # $ pip install -e .[dev,test] 87 | extras_require={ 88 | 'dev': ['check-manifest'], 89 | 'test': ['coverage'], 90 | }, 91 | 92 | # If there are data files included in your packages that need to be 93 | # installed, specify them here. If using Python 2.6 or less, then these 94 | # have to be included in MANIFEST.in as well. 95 | package_data={ 96 | }, 97 | 98 | # Although 'package_data' is the preferred approach, in some case you may 99 | # need to place data files outside of your packages. See: 100 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 101 | # In this case, 'data_file' will be installed into '/my_data' 102 | data_files=[], 103 | 104 | # To provide executable scripts, use entry points in preference to the 105 | # "scripts" keyword. Entry points provide cross-platform support and allow 106 | # pip to create the appropriate form of executable for the target platform. 107 | entry_points={ 108 | 'console_scripts': [ 109 | 'annotate = corenlp.main:main', 110 | ], 111 | }, 112 | ) 113 | -------------------------------------------------------------------------------- /tests/test_protobuf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests to read a stored protobuf. 3 | Also serves as an example of how to parse sentences, tokens, pos, lemma, 4 | ner, dependencies and mentions. 5 | 6 | The test corresponds to annotations for the following sentence: 7 | Chris wrote a simple sentence that he parsed with Stanford CoreNLP. 
8 | """ 9 | 10 | import os 11 | from pytest import fixture 12 | from corenlp_protobuf import Document, Sentence, Token, DependencyGraph,\ 13 | CorefChain 14 | from corenlp_protobuf import parseFromDelimitedString, writeToDelimitedString, to_text 15 | 16 | 17 | # Thext that was annotated 18 | TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n" 19 | 20 | 21 | @fixture 22 | def doc_pb(): 23 | test_dir = os.path.dirname(os.path.abspath(__file__)) 24 | test_data = os.path.join(test_dir, 'data', 'test.dat') 25 | with open(test_data, 'rb') as f: 26 | buf = f.read() 27 | doc = Document() 28 | parseFromDelimitedString(doc, buf) 29 | return doc 30 | 31 | def test_parse_protobuf(doc_pb): 32 | assert doc_pb.ByteSize() == 4239 33 | 34 | def test_write_protobuf(doc_pb): 35 | stream = writeToDelimitedString(doc_pb) 36 | buf = stream.getvalue() 37 | stream.close() 38 | 39 | doc_pb_ = Document() 40 | parseFromDelimitedString(doc_pb_, buf) 41 | assert doc_pb == doc_pb_ 42 | 43 | def test_document_text(doc_pb): 44 | assert doc_pb.text == TEXT 45 | 46 | 47 | def test_sentences(doc_pb): 48 | assert len(doc_pb.sentence) == 1 49 | 50 | sentence = doc_pb.sentence[0] 51 | assert isinstance(sentence, Sentence) 52 | # check sentence length 53 | assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67 54 | # Note that the sentence text should actually be recovered from the tokens. 55 | assert sentence.text == '' 56 | assert to_text(sentence) == TEXT[:-1] 57 | 58 | 59 | def test_tokens(doc_pb): 60 | sentence = doc_pb.sentence[0] 61 | tokens = sentence.token 62 | assert len(tokens) == 12 63 | assert isinstance(tokens[0], Token) 64 | 65 | # Word 66 | words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split() 67 | words_ = [t.word for t in tokens] 68 | assert words_ == words 69 | 70 | # Lemma 71 | lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split() 72 | lemmas_ = [t.lemma for t in tokens] 73 | assert lemmas_ == lemmas 74 | 75 | # POS 76 | pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split() 77 | pos_ = [t.pos for t in tokens] 78 | assert pos_ == pos 79 | 80 | # NER 81 | ner = "PERSON O O O O O O O O ORGANIZATION O O".split() 82 | ner_ = [t.ner for t in tokens] 83 | assert ner_ == ner 84 | 85 | # character offsets 86 | begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()] 87 | end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()] 88 | begin_ = [t.beginChar for t in tokens] 89 | end_ = [t.endChar for t in tokens] 90 | assert begin_ == begin 91 | assert end_ == end 92 | 93 | 94 | def test_dependency_parse(doc_pb): 95 | """ 96 | Extract the dependency parse from the annotation. 97 | """ 98 | sentence = doc_pb.sentence[0] 99 | 100 | # You can choose from the following types of dependencies. 101 | # In general, you'll want enhancedPlusPlus 102 | assert sentence.basicDependencies.ByteSize() > 0 103 | assert sentence.enhancedDependencies.ByteSize() > 0 104 | assert sentence.enhancedPlusPlusDependencies.ByteSize() > 0 105 | 106 | tree = sentence.enhancedPlusPlusDependencies 107 | isinstance(tree, DependencyGraph) 108 | # Indices are 1-indexd with 0 being the "pseudo root" 109 | assert tree.root # 'wrote' is the root. == [2] 110 | # There are as many nodes as there are tokens. 
111 | assert len(tree.node) == len(sentence.token) 112 | 113 | # Enhanced++ dependencies often contain additional edges and are 114 | # not trees -- here, 'parsed' would also have an edge to 115 | # 'sentence' 116 | assert len(tree.edge) == 12 117 | 118 | # This edge goes from "wrote" to "Chris" 119 | edge = tree.edge[0] 120 | assert edge.source == 2 121 | assert edge.target == 1 122 | assert edge.dep == "nsubj" 123 | 124 | 125 | def test_coref_chain(doc_pb): 126 | """ 127 | Extract the coreference chains from the annotation. 128 | """ 129 | # Coreference chains span sentences and are stored in the 130 | # document. 131 | chains = doc_pb.corefChain 132 | 133 | # In this document there is 1 chain with Chris and he. 134 | assert len(chains) == 1 135 | chain = chains[0] 136 | assert isinstance(chain, CorefChain) 137 | assert chain.mention[0].beginIndex == 0 # 'Chris' 138 | assert chain.mention[0].endIndex == 1 139 | assert chain.mention[0].gender == "MALE" 140 | 141 | assert chain.mention[1].beginIndex == 6 # 'he' 142 | assert chain.mention[1].endIndex == 7 143 | assert chain.mention[1].gender == "MALE" 144 | 145 | assert chain.representative == 0 # Head of the chain is 'Chris' 146 | -------------------------------------------------------------------------------- /corenlp/annotator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines a base class that can be used to annotate. 3 | """ 4 | import io 5 | from multiprocessing import Process 6 | from six.moves.BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer 7 | from six.moves import http_client as HTTPStatus 8 | 9 | from corenlp_protobuf import Document, parseFromDelimitedString, writeToDelimitedString 10 | 11 | class Annotator(Process): 12 | """ 13 | This annotator base class hosts a lightweight server that accepts 14 | annotation requests from CoreNLP. 15 | Each annotator simply defines 3 functions: requires, provides and annotate. 16 | 17 | This class takes care of defining appropriate endpoints to interface 18 | with CoreNLP. 19 | """ 20 | @property 21 | def name(self): 22 | """ 23 | Name of the annotator (used by CoreNLP) 24 | """ 25 | raise NotImplementedError() 26 | 27 | @property 28 | def requires(self): 29 | """ 30 | Requires has to specify all the annotations required before we 31 | are called. 32 | """ 33 | raise NotImplementedError() 34 | 35 | @property 36 | def provides(self): 37 | """ 38 | The set of annotations guaranteed to be provided when we are done. 39 | NOTE: that these annotations are either fully qualified Java 40 | class names or refer to nested classes of 41 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 42 | """ 43 | raise NotImplementedError() 44 | 45 | def annotate(self, ann): 46 | """ 47 | @ann: is a protobuf annotation object. 48 | Actually populate @ann with tokens. 49 | """ 50 | raise NotImplementedError() 51 | 52 | @property 53 | def properties(self): 54 | """ 55 | Defines the Java properties used to register this annotator with CoreNLP. 
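For example, an annotator named "happyfun" served on the default port 8432 maps customAnnotatorClass.happyfun to CoreNLP's GenericWebServiceAnnotator and points generic.endpoint at this annotator's host and port, alongside comma-joined generic.requires and generic.provides lists, as built below.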
56 | """ 57 | return { 58 | "customAnnotatorClass.{}".format(self.name): "edu.stanford.nlp.pipeline.GenericWebServiceAnnotator", 59 | "generic.endpoint": "http://{}:{}".format(self.host, self.port), 60 | "generic.requires": ",".join(self.requires), 61 | "generic.provides": ",".join(self.provides), 62 | } 63 | 64 | class _Handler(BaseHTTPRequestHandler): 65 | annotator = None 66 | 67 | def __init__(self, request, client_address, server): 68 | BaseHTTPRequestHandler.__init__(self, request, client_address, server) 69 | 70 | def do_GET(self): 71 | """ 72 | Handle a ping request 73 | """ 74 | if not self.path.endswith("/"): self.path += "/" 75 | if self.path == "/ping/": 76 | msg = "pong".encode("UTF-8") 77 | 78 | self.send_response(HTTPStatus.OK) 79 | self.send_header("Content-Type", "text/application") 80 | self.send_header("Content-Length", len(msg)) 81 | self.end_headers() 82 | self.wfile.write(msg) 83 | else: 84 | self.send_response(HTTPStatus.BAD_REQUEST) 85 | self.end_headers() 86 | 87 | def do_POST(self): 88 | """ 89 | Handle an annotate request 90 | """ 91 | if not self.path.endswith("/"): self.path += "/" 92 | if self.path == "/annotate/": 93 | # Read message 94 | length = int(self.headers.get('content-length')) 95 | msg = self.rfile.read(length) 96 | 97 | # Do the annotation 98 | doc = Document() 99 | parseFromDelimitedString(doc, msg) 100 | self.annotator.annotate(doc) 101 | 102 | with io.BytesIO() as stream: 103 | writeToDelimitedString(doc, stream) 104 | msg = stream.getvalue() 105 | 106 | # write message 107 | self.send_response(HTTPStatus.OK) 108 | self.send_header("Content-Type", "application/x-protobuf") 109 | self.send_header("Content-Length", len(msg)) 110 | self.end_headers() 111 | self.wfile.write(msg) 112 | 113 | else: 114 | self.send_response(HTTPStatus.BAD_REQUEST) 115 | self.end_headers() 116 | 117 | def __init__(self, host="", port=8432): 118 | """ 119 | Launches a server endpoint to communicate with CoreNLP 120 | """ 121 | Process.__init__(self) 122 | self.host, self.port = host, port 123 | self._Handler.annotator = self 124 | 125 | def run(self): 126 | """ 127 | Runs the server using Python's simple HTTPServer. 128 | TODO: make this multithreaded. 129 | """ 130 | httpd = HTTPServer((self.host, self.port), self._Handler) 131 | sa = httpd.socket.getsockname() 132 | serve_message = "Serving HTTP on {host} port {port} (http://{host}:{port}/) ..." 133 | print(serve_message.format(host=sa[0], port=sa[1])) 134 | try: 135 | httpd.serve_forever() 136 | except KeyboardInterrupt: 137 | print("\nKeyboard interrupt received, exiting.") 138 | httpd.shutdown() 139 | -------------------------------------------------------------------------------- /tests/test_annotator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | A test annotator (tokens). 4 | """ 5 | import six 6 | 7 | import pytest 8 | import time 9 | import requests 10 | import corenlp 11 | from .happyfuntokenizer import Tokenizer 12 | 13 | class HappyFunTokenizer(Tokenizer, corenlp.Annotator): 14 | def __init__(self, preserve_case=False): 15 | Tokenizer.__init__(self, preserve_case) 16 | corenlp.Annotator.__init__(self) 17 | 18 | @property 19 | def name(self): 20 | """ 21 | Name of the annotator (used by CoreNLP) 22 | """ 23 | return "happyfun" 24 | 25 | @property 26 | def requires(self): 27 | """ 28 | Requires has to specify all the annotations required before we 29 | are called. 
30 | """ 31 | return [] 32 | 33 | @property 34 | def provides(self): 35 | """ 36 | The set of annotations guaranteed to be provided when we are done. 37 | NOTE: that these annotations are either fully qualified Java 38 | class names or refer to nested classes of 39 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 40 | """ 41 | return ["TextAnnotation", 42 | "TokensAnnotation", 43 | "TokenBeginAnnotation", 44 | "TokenEndAnnotation", 45 | "CharacterOffsetBeginAnnotation", 46 | "CharacterOffsetEndAnnotation", 47 | ] 48 | 49 | def annotate(self, ann): 50 | """ 51 | @ann: is a protobuf annotation object. 52 | Actually populate @ann with tokens. 53 | """ 54 | buf, beg_idx, end_idx = ann.text.lower(), 0, 0 55 | for i, word in enumerate(self.tokenize(ann.text)): 56 | token = ann.sentencelessToken.add() 57 | # These are the bare minimum required for the TokenAnnotation 58 | token.word = six.u(word) 59 | token.tokenBeginIndex = i 60 | token.tokenEndIndex = i+1 61 | 62 | # Seek into the txt until you can find this word. 63 | try: 64 | # Try to update beginning index 65 | beg_idx = buf.index(word, beg_idx) 66 | except ValueError: 67 | # Give up -- this will be something random 68 | end_idx = beg_idx + len(word) 69 | 70 | token.beginChar = beg_idx 71 | token.endChar = end_idx 72 | 73 | beg_idx, end_idx = end_idx, end_idx 74 | 75 | def test_annotator_annotate(): 76 | cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 77 | u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()), 78 | (u"HTML entities & other Web oddities can be an ácute pain >:(", 79 | u"html entities and other web oddities can be an ácute".split() + [u"", u"pain", u"", u">:("]), 80 | (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", 81 | u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split()) 82 | ] 83 | 84 | annotator = HappyFunTokenizer() 85 | 86 | for text, tokens in cases: 87 | ann = corenlp.Document() 88 | ann.text = text 89 | annotator.annotate(ann) 90 | tokens_ = [t.word for t in ann.sentencelessToken] 91 | assert tokens_ == tokens 92 | 93 | def test_annotator_alive(): 94 | annotator = HappyFunTokenizer() 95 | annotator.start() 96 | 97 | try: 98 | time.sleep(2) 99 | # Ping the annotator. 100 | r = requests.get("http://localhost:8432/ping") 101 | assert r.ok 102 | assert r.content.decode("utf-8") == "pong" 103 | r = requests.get("http://localhost:8432/ping/") 104 | assert r.ok 105 | assert r.content.decode("utf-8") == "pong" 106 | finally: 107 | annotator.terminate() 108 | annotator.join() 109 | 110 | # Ignore this test because the CustomAnnotator interface isn't a part of 111 | # StanfordCoreNLP yet. 
112 | @pytest.mark.skip(reason="Ignore this test because the CustomAnnotator interface isn't a part of Stanford CoreNLP yet.") 113 | def test_tokenizer(): 114 | cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 115 | u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()), 116 | (u"HTML entities & other Web oddities can be an ácute pain >:(", 117 | u"html entities and other web oddities can be an ácute".split() + [u"", u"pain", u"", u">:("]), 118 | (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", 119 | u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split()) 120 | ] 121 | 122 | annotator = HappyFunTokenizer() 123 | annotator.start() 124 | 125 | try: 126 | with corenlp.CoreNLPClient(properties=annotator.properties, annotators="happyfun ssplit pos".split()) as client: 127 | for text, tokens in cases: 128 | ann = client.annotate(text) 129 | tokens_ = [t.word for t in ann.sentence[0].token] 130 | assert tokens == tokens_ 131 | finally: 132 | annotator.terminate() 133 | annotator.join() 134 | -------------------------------------------------------------------------------- /tests/happyfuntokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This code implements a basic, Twitter-aware tokenizer. 6 | 7 | A tokenizer is a function that splits a string of text into words. In 8 | Python terms, we map string and unicode objects into lists of unicode 9 | objects. 10 | 11 | There is not a single right way to do tokenizing. The best method 12 | depends on the application. This tokenizer is designed to be flexible 13 | and this easy to adapt to new domains and tasks. The basic logic is 14 | this: 15 | 16 | 1. The tuple regex_strings defines a list of regular expression 17 | strings. 18 | 19 | 2. The regex_strings strings are put, in order, into a compiled 20 | regular expression object called word_re. 21 | 22 | 3. The tokenization is done by word_re.findall(s), where s is the 23 | user-supplied string, inside the tokenize() method of the class 24 | Tokenizer. 25 | 26 | 4. When instantiating Tokenizer objects, there is a single option: 27 | preserve_case. By default, it is set to True. If it is set to 28 | False, then the tokenizer will downcase everything except for 29 | emoticons. 30 | 31 | The __main__ method illustrates by tokenizing a few examples. 32 | 33 | I've also included a Tokenizer method tokenize_random_tweet(). If the 34 | twitter library is installed (http://code.google.com/p/python-twitter/) 35 | and Twitter is cooperating, then it should tokenize a random 36 | English-language tweet. 
37 | """ 38 | 39 | __author__ = "Christopher Potts" 40 | __copyright__ = "Copyright 2011, Christopher Potts" 41 | __credits__ = [] 42 | __license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/" 43 | __version__ = "1.0" 44 | __maintainer__ = "Christopher Potts" 45 | __email__ = "See the author's website" 46 | 47 | ###################################################################### 48 | 49 | import re 50 | from six.moves import html_entities 51 | 52 | ###################################################################### 53 | # The following strings are components in the regular expression 54 | # that is used for tokenizing. It's important that phone_number 55 | # appears first in the final regex (since it can contain whitespace). 56 | # It also could matter that tags comes after emoticons, due to the 57 | # possibility of having text like 58 | # 59 | # <:| and some text >:) 60 | # 61 | # Most imporatantly, the final element should always be last, since it 62 | # does a last ditch whitespace-based tokenization of whatever is left. 63 | 64 | # This particular element is used in a couple ways, so we define it 65 | # with a name: 66 | emoticon_string = r""" 67 | (?: 68 | [<>]? 69 | [:;=8] # eyes 70 | [\-o\*\']? # optional nose 71 | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth 72 | | 73 | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth 74 | [\-o\*\']? # optional nose 75 | [:;=8] # eyes 76 | [<>]? 77 | )""" 78 | 79 | # The components of the tokenizer: 80 | regex_strings = ( 81 | # Phone numbers: 82 | r""" 83 | (?: 84 | (?: # (international) 85 | \+?[01] 86 | [\-\s.]* 87 | )? 88 | (?: # (area code) 89 | [\(]? 90 | \d{3} 91 | [\-\s.\)]* 92 | )? 93 | \d{3} # exchange 94 | [\-\s.]* 95 | \d{4} # base 96 | )""" 97 | , 98 | # Emoticons: 99 | emoticon_string 100 | , 101 | # HTML tags: 102 | r"""<[^>]+>""" 103 | , 104 | # Twitter username: 105 | r"""(?:@[\w_]+)""" 106 | , 107 | # Twitter hashtags: 108 | r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" 109 | , 110 | # Remaining word types: 111 | r""" 112 | (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. 113 | | 114 | (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. 115 | | 116 | (?:[\w_]+) # Words without apostrophes or dashes. 117 | | 118 | (?:\.(?:\s*\.){1,}) # Ellipsis dots. 119 | | 120 | (?:\S) # Everything else that isn't whitespace. 
121 | """ 122 | ) 123 | 124 | ###################################################################### 125 | # This is the core tokenizing regex: 126 | 127 | word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE) 128 | 129 | # The emoticon string gets its own regex so that we can preserve case for them as needed: 130 | emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE) 131 | 132 | # These are for regularizing HTML entities to Unicode: 133 | html_entity_digit_re = re.compile(r"&#\d+;") 134 | html_entity_alpha_re = re.compile(r"&\w+;") 135 | amp = "&" 136 | 137 | ###################################################################### 138 | 139 | class Tokenizer: 140 | def __init__(self, preserve_case=False): 141 | self.preserve_case = preserve_case 142 | 143 | def tokenize(self, s): 144 | """ 145 | Argument: s -- any string or unicode object 146 | Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False 147 | """ 148 | # Try to ensure unicode: 149 | try: 150 | s = str(s) 151 | except UnicodeDecodeError: 152 | s = str(s).encode('string_escape') 153 | s = str(s) 154 | # Fix HTML character entitites: 155 | s = self.__html2unicode(s) 156 | # Tokenize: 157 | words = word_re.findall(s) 158 | # Possible alter the case, but avoid changing emoticons like :D into :d: 159 | if not self.preserve_case: 160 | words = list(map((lambda x : x if emoticon_re.search(x) else x.lower()), words)) 161 | return words 162 | 163 | def tokenize_random_tweet(self): 164 | """ 165 | If the twitter library is installed and a twitter connection 166 | can be established, then tokenize a random tweet. 167 | """ 168 | try: 169 | import twitter 170 | except ImportError: 171 | print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/") 172 | from random import shuffle 173 | api = twitter.Api() 174 | tweets = api.GetPublicTimeline() 175 | if tweets: 176 | for tweet in tweets: 177 | if tweet.user.lang == 'en': 178 | return self.tokenize(tweet.text) 179 | else: 180 | raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again") 181 | 182 | def __html2unicode(self, s): 183 | """ 184 | Internal metod that seeks to replace all the HTML entities in 185 | s with their corresponding unicode characters. 186 | """ 187 | # First the digits: 188 | ents = set(html_entity_digit_re.findall(s)) 189 | if len(ents) > 0: 190 | for ent in ents: 191 | entnum = ent[2:-1] 192 | try: 193 | entnum = int(entnum) 194 | s = s.replace(ent, chr(entnum)) 195 | except: 196 | pass 197 | # Now the alpha versions: 198 | ents = set(html_entity_alpha_re.findall(s)) 199 | ents = list(filter((lambda x : x != amp), ents)) 200 | for ent in ents: 201 | entname = ent[1:-1] 202 | try: 203 | s = s.replace(ent, chr(html_entities.name2codepoint[entname])) 204 | except: 205 | pass 206 | s = s.replace(amp, " and ") 207 | return s 208 | 209 | ############################################################################### 210 | 211 | if __name__ == '__main__': 212 | tok = Tokenizer(preserve_case=False) 213 | samples = ( 214 | "RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 215 | "HTML entities & other Web oddities can be an ácute pain >:(", 216 | "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace." 
217 | ) 218 | 219 | for s in samples: 220 | print("======================================================================") 221 | print(s) 222 | tokenized = tok.tokenize(s) 223 | print("\n".join(tokenized)) 224 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Stanford CoreNLP Python Interface 2 | ================================= 3 | 4 | **NOTE:** This package is now deprecated. Please use the `stanza <https://github.com/stanfordnlp/stanza>`_ package instead. 5 | 6 | 7 | .. image:: https://travis-ci.org/stanfordnlp/python-stanford-corenlp.svg?branch=master 8 | :target: https://travis-ci.org/stanfordnlp/python-stanford-corenlp 9 | 10 | This package contains a python interface for `Stanford CoreNLP 11 | `_, including a reference 12 | implementation for interfacing with the `Stanford CoreNLP server 13 | `_. 14 | The package also contains a base class to expose a python-based annotation 15 | provider (e.g. your favorite neural NER system) to the CoreNLP 16 | pipeline via a lightweight service. 17 | 18 | To use the package, first download the `official java CoreNLP release 19 | `_, unzip it, and define an environment 20 | variable :code:`$CORENLP_HOME` that points to the unzipped directory. 21 | 22 | You can also install this package from `PyPI `_ using :code:`pip install stanford-corenlp` 23 | 24 | ---- 25 | 26 | Command Line Usage 27 | ------------------ 28 | Probably the easiest way to use this package is through the `annotate` command-line utility:: 29 | 30 | usage: annotate [-h] [-i INPUT] [-o OUTPUT] [-f {json}] 31 | [-a ANNOTATORS [ANNOTATORS ...]] [-s] [-v] [-m MEMORY] 32 | [-p PROPS [PROPS ...]] 33 | 34 | Annotate data 35 | 36 | optional arguments: 37 | -h, --help show this help message and exit 38 | -i INPUT, --input INPUT 39 | Input file to process; each line contains one document 40 | (default: stdin) 41 | -o OUTPUT, --output OUTPUT 42 | File to write annotations to (default: stdout) 43 | -f {json}, --format {json} 44 | Output format 45 | -a ANNOTATORS [ANNOTATORS ...], --annotators ANNOTATORS [ANNOTATORS ...] 46 | A list of annotators 47 | -s, --sentence-mode Assume each line of input is a sentence. 48 | -v, --verbose-server Server is made verbose 49 | -m MEMORY, --memory MEMORY 50 | Memory to use for the server 51 | -p PROPS [PROPS ...], --props PROPS [PROPS ...] 52 | Properties as a list of key=value pairs 53 | 54 | 55 | We recommend using `annotate` in conjunction with the wonderful `jq` 56 | command to process the output. As an example, given a file with a 57 | sentence on each line, the following command produces the equivalent 58 | space-separated tokens:: 59 | 60 | cat file.txt | annotate -s -a tokenize | jq '[.tokens[].originalText]' > tokenized.txt 61 | 62 | 63 | Annotation Server Usage 64 | ----------------------- 65 | 66 | .. code-block:: python 67 | 68 | import corenlp 69 | 70 | text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP." 71 | 72 | # We assume that you've downloaded Stanford CoreNLP and defined an environment 73 | # variable $CORENLP_HOME that points to the unzipped directory. 74 | # The code below will launch StanfordCoreNLPServer in the background 75 | # and communicate with the server to annotate the sentence. 76 | with corenlp.CoreNLPClient(annotators="tokenize ssplit pos lemma ner depparse".split()) as client: 77 | ann = client.annotate(text) 78 | 79 | # You can access annotations using ann. 
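# With the default "serialized" output format, ann is a Document protobuf (see doc/CoreNLP.proto), so sentences, tokens and coref chains are exposed as repeated fields such as ann.sentence and ann.corefChain.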
80 | sentence = ann.sentence[0] 81 | 82 | # The corenlp.to_text function is a helper function that 83 | # reconstructs a sentence from tokens. 84 | assert corenlp.to_text(sentence) == text 85 | 86 | # You can access any property within a sentence. 87 | print(sentence.text) 88 | 89 | # Likewise for tokens 90 | token = sentence.token[0] 91 | print(token.lemma) 92 | 93 | # Use tokensregex patterns to find who wrote a sentence. 94 | pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/' 95 | matches = client.tokensregex(text, pattern) 96 | # sentences contains a list with matches for each sentence. 97 | assert len(matches["sentences"]) == 1 98 | # length tells you whether or not there are any matches in this sentence. 99 | assert matches["sentences"][0]["length"] == 1 100 | # You can access matches like most regex groups. 101 | matches["sentences"][0]["0"]["text"] == "Chris wrote a simple sentence" 102 | matches["sentences"][0]["0"]["1"]["text"] == "Chris" 103 | 104 | # Use semgrex patterns to directly find who wrote what. 105 | pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object' 106 | matches = client.semgrex(text, pattern) 107 | # sentences contains a list with matches for each sentence. 108 | assert len(matches["sentences"]) == 1 109 | # length tells you whether or not there are any matches in this sentence. 110 | assert matches["sentences"][0]["length"] == 1 111 | # You can access matches like most regex groups. 112 | matches["sentences"][0]["0"]["text"] == "wrote" 113 | matches["sentences"][0]["0"]["$subject"]["text"] == "Chris" 114 | matches["sentences"][0]["0"]["$object"]["text"] == "sentence" 115 | 116 | See `test_client.py` and `test_protobuf.py` for more examples. Props to 117 | @dan-zheng for tokensregex/semgrex support. 118 | 119 | 120 | Annotation Service Usage 121 | ------------------------ 122 | 123 | *NOTE*: The annotation service allows users to provide a custom 124 | annotator to be used by the CoreNLP pipeline. Unfortunately, it relies 125 | on experimental code internal to the Stanford CoreNLP project that is not yet 126 | available for public use. 127 | 128 | .. code-block:: python 129 | 130 | import corenlp 131 | from .happyfuntokenizer import Tokenizer 132 | 133 | class HappyFunTokenizer(Tokenizer, corenlp.Annotator): 134 | def __init__(self, preserve_case=False): 135 | Tokenizer.__init__(self, preserve_case) 136 | corenlp.Annotator.__init__(self) 137 | 138 | @property 139 | def name(self): 140 | """ 141 | Name of the annotator (used by CoreNLP) 142 | """ 143 | return "happyfun" 144 | 145 | @property 146 | def requires(self): 147 | """ 148 | Requires has to specify all the annotations required before we 149 | are called. 150 | """ 151 | return [] 152 | 153 | @property 154 | def provides(self): 155 | """ 156 | The set of annotations guaranteed to be provided when we are done. 157 | NOTE: that these annotations are either fully qualified Java 158 | class names or refer to nested classes of 159 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 160 | """ 161 | return ["TextAnnotation", 162 | "TokensAnnotation", 163 | "TokenBeginAnnotation", 164 | "TokenEndAnnotation", 165 | "CharacterOffsetBeginAnnotation", 166 | "CharacterOffsetEndAnnotation", 167 | ] 168 | 169 | def annotate(self, ann): 170 | """ 171 | @ann: is a protobuf annotation object. 172 | Actually populate @ann with tokens. 
173 | """ 174 | buf, beg_idx, end_idx = ann.text.lower(), 0, 0 175 | for i, word in enumerate(self.tokenize(ann.text)): 176 | token = ann.sentencelessToken.add() 177 | # These are the bare minimum required for the TokenAnnotation 178 | token.word = word 179 | token.tokenBeginIndex = i 180 | token.tokenEndIndex = i+1 181 | 182 | # Seek into the txt until you can find this word. 183 | try: 184 | # Try to update beginning index 185 | beg_idx = buf.index(word, beg_idx) 186 | except ValueError: 187 | # Give up -- this will be something random 188 | end_idx = beg_idx + len(word) 189 | 190 | token.beginChar = beg_idx 191 | token.endChar = end_idx 192 | 193 | beg_idx, end_idx = end_idx, end_idx 194 | 195 | annotator = HappyFunTokenizer() 196 | # Calling .start() will launch the annotator as a service running on 197 | # port 8432 by default. 198 | annotator.start() 199 | 200 | # annotator.properties contains all the right properties for 201 | # Stanford CoreNLP to use this annotator. 202 | with corenlp.CoreNLPClient(properties=annotator.properties, annotators="happyfun ssplit pos".split()) as client: 203 | ann = client.annotate("RT @ #happyfuncoding: this is a typical Twitter tweet :-)") 204 | 205 | tokens = [t.word for t in ann.sentence[0].token] 206 | print(tokens) 207 | 208 | 209 | See `test_annotator.py` for more examples. 210 | -------------------------------------------------------------------------------- /corenlp/client.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Python CoreNLP: a server based interface to Java CoreNLP. 3 | """ 4 | import io 5 | import os 6 | import logging 7 | import json 8 | import shlex 9 | import subprocess 10 | import time 11 | import sys 12 | import datetime 13 | 14 | from six.moves.urllib.parse import urlparse 15 | 16 | import requests 17 | 18 | from corenlp_protobuf import Document, parseFromDelimitedString, writeToDelimitedString, to_text 19 | __author__ = 'arunchaganty, kelvinguu, vzhong, wmonroe4' 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | class AnnotationException(Exception): 24 | """ 25 | Exception raised when there was an error communicating with the CoreNLP server. 26 | """ 27 | pass 28 | 29 | class TimeoutException(AnnotationException): 30 | """ 31 | Exception raised when the CoreNLP server timed out. 32 | """ 33 | pass 34 | 35 | class ShouldRetryException(Exception): 36 | """ 37 | Exception raised if the service should retry the request. 38 | """ 39 | pass 40 | 41 | class PermanentlyFailedException(Exception): 42 | """ 43 | Exception raised if the service should retry the request. 44 | """ 45 | pass 46 | 47 | class RobustService(object): 48 | """ 49 | Service that resuscitates itself if it is not available. 50 | """ 51 | TIMEOUT = 15 52 | 53 | def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout, 54 | stderr=sys.stderr, be_quiet=False): 55 | self.start_cmd = start_cmd and shlex.split(start_cmd) 56 | self.stop_cmd = stop_cmd and shlex.split(stop_cmd) 57 | self.endpoint = endpoint 58 | self.stdout = stdout 59 | self.stderr = stderr 60 | 61 | self.server = None 62 | self.is_active = False 63 | self.be_quiet = be_quiet 64 | 65 | def is_alive(self): 66 | try: 67 | return requests.get(self.endpoint + "/ping").ok 68 | except requests.exceptions.ConnectionError as e: 69 | raise ShouldRetryException(e) 70 | 71 | def start(self): 72 | if self.start_cmd: 73 | if self.be_quiet: 74 | # Issue #26: subprocess.DEVNULL isn't supported in python 2.7. 
75 | stderr = open(os.devnull, 'w') 76 | else: 77 | stderr = self.stderr 78 | self.server = subprocess.Popen(self.start_cmd, 79 | stderr=stderr, 80 | stdout=stderr) 81 | 82 | def stop(self): 83 | if self.server: 84 | self.server.kill() 85 | if self.stop_cmd: 86 | subprocess.run(self.stop_cmd, check=True) 87 | self.is_active = False 88 | 89 | def __enter__(self): 90 | self.start() 91 | return self 92 | 93 | def __exit__(self, _, __, ___): 94 | self.stop() 95 | 96 | def ensure_alive(self): 97 | # Check if the service is active and alive 98 | if self.is_active: 99 | try: 100 | return self.is_alive() 101 | except ShouldRetryException: 102 | pass 103 | 104 | # If not, try to start up the service. 105 | if self.server is None: 106 | self.start() 107 | 108 | # Wait for the service to start up. 109 | start_time = time.time() 110 | while True: 111 | try: 112 | if self.is_alive(): 113 | break 114 | except ShouldRetryException: 115 | pass 116 | 117 | if time.time() - start_time < self.TIMEOUT: 118 | time.sleep(1) 119 | else: 120 | raise PermanentlyFailedException("Timed out waiting for service to come alive.") 121 | 122 | # At this point we are guaranteed that the service is alive. 123 | self.is_active = True 124 | 125 | class CoreNLPClient(RobustService): 126 | """ 127 | A CoreNLP client to the Stanford CoreNLP server. 128 | """ 129 | DEFAULT_ANNOTATORS = "tokenize ssplit lemma pos ner depparse".split() 130 | DEFAULT_PROPERTIES = {} 131 | DEFAULT_OUTPUT_FORMAT = "serialized" 132 | 133 | def __init__(self, start_server=True, 134 | endpoint="http://localhost:9000", 135 | timeout=15000, 136 | threads=5, 137 | annotators=None, 138 | properties=None, 139 | output_format=None, 140 | stdout=sys.stdout, 141 | stderr=sys.stderr, 142 | memory="4G", 143 | be_quiet=True, 144 | max_char_length=100000 145 | ): 146 | if isinstance(annotators, str): 147 | annotators = annotators.split() 148 | 149 | if start_server: 150 | host, port = urlparse(endpoint).netloc.split(":") 151 | assert host == "localhost", "If starting a server, endpoint must be localhost" 152 | 153 | assert os.getenv("CORENLP_HOME") is not None, "Please define $CORENLP_HOME where your CoreNLP Java checkout is" 154 | start_cmd = "java -Xmx{memory} -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length}".format( 155 | corenlp_home=os.getenv("CORENLP_HOME"), 156 | port=port, 157 | memory=memory, 158 | timeout=timeout, 159 | threads=threads, 160 | max_char_length=max_char_length) 161 | stop_cmd = None 162 | else: 163 | start_cmd = stop_cmd = None 164 | 165 | super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint, 166 | stdout, stderr, be_quiet) 167 | self.timeout = timeout 168 | self.default_annotators = annotators or self.DEFAULT_ANNOTATORS 169 | self.default_properties = properties or self.DEFAULT_PROPERTIES 170 | self.default_output_format = output_format or self.DEFAULT_OUTPUT_FORMAT 171 | 172 | def _request(self, buf, properties, date=None): 173 | """Send a request to the CoreNLP server. 
174 | 175 | :param (str | unicode) text: raw text for the CoreNLPServer to parse 176 | :param (dict) properties: properties that the server expects 177 | :param (str) date: reference date of document, used by server to set docDate - expects YYYY-MM-DD 178 | :return: request result 179 | """ 180 | self.ensure_alive() 181 | 182 | try: 183 | input_format = properties.get("inputFormat", "text") 184 | if input_format == "text": 185 | ctype = "text/plain; charset=utf-8" 186 | elif input_format == "serialized": 187 | ctype = "application/x-protobuf" 188 | else: 189 | raise ValueError("Unrecognized inputFormat " + input_format) 190 | 191 | if date: 192 | params = {'properties': str(properties),'date': str(date)} 193 | else: 194 | params = {'properties': str(properties)} 195 | 196 | r = requests.post(self.endpoint, 197 | params=params, 198 | data=buf, headers={'content-type': ctype}, 199 | timeout=(self.timeout*2)/1000) 200 | r.raise_for_status() 201 | return r 202 | except requests.HTTPError as e: 203 | if r.text == "CoreNLP request timed out. Your document may be too long.": 204 | raise TimeoutException(r.text) 205 | else: 206 | raise AnnotationException(r.text) 207 | 208 | def annotate(self, text, annotators=None, output_format=None, properties=None, date=None): 209 | """Send a request to the CoreNLP server. 210 | 211 | :param (str | unicode) text: raw text for the CoreNLPServer to parse 212 | :param (list | string) annotators: list of annotators to use 213 | :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml 214 | :param (dict) properties: properties that the server expects 215 | :return: request result 216 | """ 217 | # set properties for server call 218 | if properties is None: 219 | properties = self.default_properties 220 | properties.update({ 221 | 'annotators': ','.join(annotators or self.default_annotators), 222 | 'inputFormat': 'text', 223 | 'outputFormat': self.default_output_format, 224 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 225 | }) 226 | elif "annotators" not in properties: 227 | properties.update({'annotators': ','.join(annotators or self.default_annotators)}) 228 | # if an output_format is specified, use that to override 229 | if output_format is not None: 230 | properties["outputFormat"] = output_format 231 | # make the request 232 | r = self._request(text.encode('utf-8'), properties, date) 233 | # customize what is returned based outputFormat 234 | if properties["outputFormat"] == "serialized": 235 | doc = Document() 236 | parseFromDelimitedString(doc, r.content) 237 | return doc 238 | elif properties["outputFormat"] == "json": 239 | return r.json() 240 | elif properties["outputFormat"] in ["text", "conllu", "conll", "xml"]: 241 | return r.text 242 | else: 243 | return r 244 | 245 | def update(self, doc, annotators=None, properties=None, date=None): 246 | if properties is None: 247 | properties = self.default_properties 248 | properties.update({ 249 | 'annotators': ','.join(annotators or self.default_annotators), 250 | 'inputFormat': 'serialized', 251 | 'outputFormat': 'serialized', 252 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 253 | }) 254 | with io.BytesIO() as stream: 255 | writeToDelimitedString(doc, stream) 256 | msg = stream.getvalue() 257 | 258 | r = self._request(msg, properties, date) 259 | doc = Document() 260 | parseFromDelimitedString(doc, r.content) 261 | return doc 262 | 263 | def tokensregex(self, text, pattern, filter=False, to_words=False, 
annotators=None, properties=None): 264 | # this is required for some reason 265 | matches = self.__regex('/tokensregex', text, pattern, filter, annotators, properties) 266 | if to_words: 267 | matches = regex_matches_to_indexed_words(matches) 268 | return matches 269 | 270 | def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None): 271 | matches = self.__regex('/semgrex', text, pattern, filter, annotators, properties) 272 | if to_words: 273 | matches = regex_matches_to_indexed_words(matches) 274 | return matches 275 | 276 | def tregrex(self, text, pattern, filter=False, annotators=None, properties=None): 277 | return self.__regex('/tregex', text, pattern, filter, annotators, properties) 278 | 279 | def __regex(self, path, text, pattern, filter, annotators=None, properties=None): 280 | """Send a regex-related request to the CoreNLP server. 281 | :param (str | unicode) path: the path for the regex endpoint 282 | :param text: raw text for the CoreNLPServer to apply the regex 283 | :param (str | unicode) pattern: regex pattern 284 | :param (bool) filter: option to filter sentences that contain matches, if false returns matches 285 | :param properties: option to filter sentences that contain matches, if false returns matches 286 | :return: request result 287 | """ 288 | self.ensure_alive() 289 | if properties is None: 290 | properties = self.default_properties 291 | properties.update({ 292 | 'annotators': ','.join(annotators or self.default_annotators), 293 | 'inputFormat': 'text', 294 | 'outputFormat': self.default_output_format, 295 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 296 | }) 297 | elif "annotators" not in properties: 298 | properties.update({'annotators': ','.join(annotators or self.default_annotators)}) 299 | 300 | # HACK: For some stupid reason, CoreNLPServer will timeout if we 301 | # need to annotate something from scratch. So, we need to call 302 | # this to ensure that the _regex call doesn't timeout. 303 | self.annotate(text, properties=properties) 304 | 305 | try: 306 | # Error occurs unless put properties in params 307 | input_format = properties.get("inputFormat", "text") 308 | if input_format == "text": 309 | ctype = "text/plain; charset=utf-8" 310 | elif input_format == "serialized": 311 | ctype = "application/x-protobuf" 312 | else: 313 | raise ValueError("Unrecognized inputFormat " + input_format) 314 | # change request method from `get` to `post` as required by CoreNLP 315 | r = requests.post( 316 | self.endpoint + path, params={ 317 | 'pattern': pattern, 318 | 'filter': filter, 319 | 'properties': str(properties) 320 | }, data=text, 321 | headers={'content-type': ctype}, 322 | timeout=(self.timeout*2)/1000, 323 | ) 324 | r.raise_for_status() 325 | return json.loads(r.text) 326 | except requests.HTTPError as e: 327 | if r.text.startswith("Timeout"): 328 | raise TimeoutException(r.text) 329 | else: 330 | raise AnnotationException(r.text) 331 | except json.JSONDecodeError: 332 | raise AnnotationException(r.text) 333 | 334 | def regex_matches_to_indexed_words(matches): 335 | """Transforms tokensregex and semgrex matches to indexed words. 
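Each returned word is the match dict for one match group, augmented with the index of the sentence it came from, e.g. {'text': 'wrote', 'begin': 1, 'end': 2, '$subject': {...}, 'sentence': 0} for the semgrex example in test_client.py.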
336 | :param matches: unprocessed regex matches 337 | :return: flat array of indexed words 338 | """ 339 | words = [dict(v, **dict([('sentence', i)])) 340 | for i, s in enumerate(matches['sentences']) 341 | for k, v in s.items() if k != 'length'] 342 | return words 343 | 344 | __all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"] 345 | -------------------------------------------------------------------------------- /doc/CoreNLP.proto: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp.pipeline; 2 | 3 | option java_package = "edu.stanford.nlp.pipeline"; 4 | option java_outer_classname = "CoreNLPProtos"; 5 | 6 | // 7 | // From JAVANLP_HOME, you can build me with the command: 8 | // 9 | // protoc -I=projects/core/src/edu/stanford/nlp/pipeline/ --java_out=projects/core/src projects/core/src/edu/stanford/nlp/pipeline/CoreNLP.proto 10 | // 11 | 12 | // 13 | // An enumeration for the valid languages allowed in CoreNLP 14 | // 15 | enum Language { 16 | Unknown = 0; 17 | Any = 1; 18 | Arabic = 2; 19 | Chinese = 3; 20 | English = 4; 21 | German = 5; 22 | French = 6; 23 | Hebrew = 7; 24 | Spanish = 8; 25 | UniversalEnglish = 9; 26 | } 27 | 28 | // 29 | // A document; that is, the equivalent of an Annotation. 30 | // 31 | message Document { 32 | required string text = 1; 33 | repeated Sentence sentence = 2; 34 | repeated CorefChain corefChain = 3; 35 | optional string docID = 4; 36 | optional string docDate = 7; 37 | optional uint64 calendar = 8; 38 | 39 | /** 40 | * A peculiar field, for the corner case when a Document is 41 | * serialized without any sentences. Otherwise 42 | */ 43 | repeated Token sentencelessToken = 5; 44 | 45 | repeated Quote quote = 6; 46 | /** 47 | * This field is for entity mentions across the document. 48 | */ 49 | repeated NERMention mentions = 9; 50 | 51 | extensions 100 to 255; 52 | } 53 | 54 | // 55 | // The serialized version of a CoreMap representing a sentence. 56 | // 57 | message Sentence { 58 | repeated Token token = 1; 59 | required uint32 tokenOffsetBegin = 2; 60 | required uint32 tokenOffsetEnd = 3; 61 | optional uint32 sentenceIndex = 4; 62 | optional uint32 characterOffsetBegin = 5; 63 | optional uint32 characterOffsetEnd = 6; 64 | optional ParseTree parseTree = 7; 65 | optional ParseTree binarizedParseTree = 31; 66 | optional ParseTree annotatedParseTree = 32; 67 | optional string sentiment = 33; 68 | repeated ParseTree kBestParseTrees = 34; 69 | optional DependencyGraph basicDependencies = 8; 70 | optional DependencyGraph collapsedDependencies = 9; 71 | optional DependencyGraph collapsedCCProcessedDependencies = 10; 72 | optional DependencyGraph alternativeDependencies = 13; 73 | repeated RelationTriple openieTriple = 14; // The OpenIE triples in the sentence 74 | repeated RelationTriple kbpTriple = 16; // The KBP triples in this sentence 75 | repeated SentenceFragment entailedSentence = 15; // The entailed sentences, by natural logic 76 | optional DependencyGraph enhancedDependencies = 17; 77 | optional DependencyGraph enhancedPlusPlusDependencies = 18; 78 | 79 | optional uint32 paragraph = 11; 80 | 81 | optional string text = 12; // Only needed if we're only saving the sentence. 
82 | 
83 | 
84 |   // Fields set by other annotators in CoreNLP
85 |   optional bool hasRelationAnnotations = 51;
86 |   repeated Entity entity = 52;
87 |   repeated Relation relation = 53;
88 |   optional bool hasNumerizedTokensAnnotation = 54;
89 |   repeated NERMention mentions = 55;
90 |   repeated Mention mentionsForCoref = 56;
91 |   optional bool hasCorefMentionsAnnotation = 57;
92 | 
93 |   optional string sentenceID = 58;  // Useful when storing sentences (e.g. ForEach)
94 | 
95 |   extensions 100 to 255;
96 | }
97 | 
98 | //
99 | // The serialized version of a Token (a CoreLabel).
100 | //
101 | message Token {
102 |   // Fields set by the default annotators [new CoreNLP(new Properties())]
103 |   required string word = 1;  // the word's gloss (post-tokenization)
104 |   optional string pos = 2;  // The word's part of speech tag
105 |   optional string value = 3;  // The word's 'value' (e.g., parse tree node)
106 |   optional string category = 4;  // The word's 'category' (e.g., parse tree node)
107 |   optional string before = 5;  // The whitespace/xml before the token
108 |   optional string after = 6;  // The whitespace/xml after the token
109 |   optional string originalText = 7;  // The original text for this token
110 |   optional string ner = 8;  // The word's NER tag
111 |   optional string normalizedNER = 9;  // The word's normalized NER tag
112 |   optional string lemma = 10;  // The word's lemma
113 |   optional uint32 beginChar = 11;  // The character offset begin, in the document
114 |   optional uint32 endChar = 12;  // The character offset end, in the document
115 |   optional uint32 utterance = 13;  // The utterance tag used in dcoref
116 |   optional string speaker = 14;  // The speaker speaking this word
117 |   optional uint32 beginIndex = 15;  // The begin index of, e.g., a span
118 |   optional uint32 endIndex = 16;  // The end index of, e.g., a span
119 |   optional uint32 tokenBeginIndex = 17;  // The begin index of the token
120 |   optional uint32 tokenEndIndex = 18;  // The end index of the token
121 |   optional Timex timexValue = 19;  // The time this word refers to
122 |   optional bool hasXmlContext = 21;  // Used by clean xml annotator
123 |   repeated string xmlContext = 22;  // Used by clean xml annotator
124 |   optional uint32 corefClusterID = 23;  // The [primary] cluster id for this token
125 |   optional string answer = 24;  // A temporary annotation which is occasionally left in
126 | //  optional string projectedCategory = 25;  // The syntactic category of the maximal constituent headed by the word. Not used anywhere, so deleted.
127 |   optional uint32 headWordIndex = 26;  // The index of the head word of this word.
128 |   optional Operator operator = 27;  // If this is an operator, which one is it and what is its scope (as per Natural Logic)?
129 | optional Polarity polarity = 28; // The polarity of this word, according to Natural Logic 130 | optional Span span = 29; // The span of a leaf node of a tree 131 | optional string sentiment = 30; // The final sentiment of the sentence 132 | optional int32 quotationIndex = 31; // The index of the quotation this token refers to 133 | optional MapStringString conllUFeatures = 32; 134 | optional string coarseTag = 33; // The coarse POS tag (used to store the UPOS tag) 135 | optional Span conllUTokenSpan = 34; 136 | optional string conllUMisc = 35; 137 | optional MapIntString conllUSecondaryDeps = 36; 138 | optional string wikipediaEntity = 37; 139 | 140 | 141 | // Fields set by other annotators in CoreNLP 142 | optional string gender = 51; // gender annotation (machine reading) 143 | optional string trueCase = 52; // true case type of token 144 | optional string trueCaseText = 53; // true case gloss of token 145 | 146 | // Fields in the CoreLabel java class that are moved elsewhere 147 | // string text @see Document#text + character offsets 148 | // uint32 sentenceIndex @see Sentence#sentenceIndex 149 | // string docID @see Document#docID 150 | // uint32 index @see implicit in Sentence 151 | // uint32 paragraph @see Sentence#paragraph 152 | 153 | extensions 100 to 255; 154 | } 155 | 156 | // 157 | // An enumeration of valid sentiment values for the sentiment classifier. 158 | // 159 | enum Sentiment { 160 | STRONG_NEGATIVE = 0; 161 | WEAK_NEGATIVE = 1; 162 | NEUTRAL = 2; 163 | WEAK_POSITIVE = 3; 164 | STRONG_POSITIVE = 4; 165 | } 166 | 167 | // 168 | // A quotation marker in text 169 | // 170 | message Quote { 171 | optional string text = 1; 172 | optional uint32 begin = 2; 173 | optional uint32 end = 3; 174 | optional uint32 sentenceBegin = 5; 175 | optional uint32 sentenceEnd = 6; 176 | optional uint32 tokenBegin = 7; 177 | optional uint32 tokenEnd = 8; 178 | optional string docid = 9; 179 | optional uint32 index = 10; 180 | } 181 | 182 | // 183 | // A syntactic parse tree, with scores. 184 | // 185 | message ParseTree { 186 | repeated ParseTree child = 1; 187 | optional string value = 2; 188 | optional uint32 yieldBeginIndex = 3; 189 | optional uint32 yieldEndIndex = 4; 190 | optional double score = 5; 191 | optional Sentiment sentiment = 6; 192 | } 193 | 194 | // 195 | // A dependency graph representation. 196 | // 197 | message DependencyGraph { 198 | message Node { 199 | required uint32 sentenceIndex = 1; 200 | required uint32 index = 2; 201 | optional uint32 copyAnnotation = 3; 202 | } 203 | 204 | message Edge { 205 | required uint32 source = 1; 206 | required uint32 target = 2; 207 | optional string dep = 3; 208 | optional bool isExtra = 4; 209 | optional uint32 sourceCopy = 5; 210 | optional uint32 targetCopy = 6; 211 | optional Language language = 7 [default=Unknown]; 212 | } 213 | 214 | repeated Node node = 1; 215 | repeated Edge edge = 2; 216 | repeated uint32 root = 3 [packed=true]; 217 | } 218 | 219 | // 220 | // A coreference chain. 221 | // These fields are not *really* optional. CoreNLP will crash without them. 
222 | // 223 | message CorefChain { 224 | message CorefMention { 225 | optional int32 mentionID = 1; 226 | optional string mentionType = 2; 227 | optional string number = 3; 228 | optional string gender = 4; 229 | optional string animacy = 5; 230 | optional uint32 beginIndex = 6; 231 | optional uint32 endIndex = 7; 232 | optional uint32 headIndex = 9; 233 | optional uint32 sentenceIndex = 10; 234 | optional uint32 position = 11; // the second element of position 235 | } 236 | 237 | required int32 chainID = 1; 238 | repeated CorefMention mention = 2; 239 | required uint32 representative = 3; 240 | } 241 | 242 | // 243 | // a mention 244 | // 245 | 246 | message Mention { 247 | optional int32 mentionID = 1; 248 | optional string mentionType = 2; 249 | optional string number = 3; 250 | optional string gender = 4; 251 | optional string animacy = 5; 252 | optional string person = 6; 253 | optional uint32 startIndex = 7; 254 | optional uint32 endIndex = 9; 255 | optional uint32 headIndex = 10; 256 | optional string headString = 11; 257 | optional string nerString = 12; 258 | optional uint32 originalRef = 13; 259 | optional int32 goldCorefClusterID = 14; 260 | optional int32 corefClusterID = 15; 261 | optional uint32 mentionNum = 16; 262 | optional uint32 sentNum = 17; 263 | optional uint32 utter = 18; 264 | optional uint32 paragraph = 19; 265 | optional bool isSubject = 20; 266 | optional bool isDirectObject = 21; 267 | optional bool isIndirectObject = 22; 268 | optional bool isPrepositionObject = 23; 269 | optional bool hasTwin = 24; 270 | optional bool generic = 25; 271 | optional bool isSingleton = 26; 272 | optional bool hasBasicDependency = 27; 273 | optional bool hasEnhancedDepenedncy = 28; 274 | optional bool hasContextParseTree = 29; 275 | optional IndexedWord headIndexedWord = 30; 276 | optional IndexedWord dependingVerb = 31; 277 | optional IndexedWord headWord = 32; 278 | optional SpeakerInfo speakerInfo = 33; 279 | 280 | repeated IndexedWord sentenceWords = 50; 281 | repeated IndexedWord originalSpan = 51; 282 | repeated string dependents = 52; 283 | repeated string preprocessedTerms = 53; 284 | repeated int32 appositions = 54; 285 | repeated int32 predicateNominatives = 55; 286 | repeated int32 relativePronouns = 56; 287 | repeated int32 listMembers = 57; 288 | repeated int32 belongToLists = 58; 289 | 290 | } 291 | 292 | // 293 | // store the position (sentence, token index) of a CoreLabel 294 | // 295 | 296 | message IndexedWord { 297 | optional uint32 sentenceNum = 1; 298 | optional uint32 tokenIndex = 2; 299 | optional uint32 docID = 3; 300 | optional uint32 copyCount = 4; 301 | } 302 | 303 | // 304 | // speaker info, this is used for Mentions 305 | // 306 | 307 | message SpeakerInfo { 308 | optional string speakerName = 1; 309 | repeated int32 mentions = 2; 310 | } 311 | 312 | // 313 | // A Span of text 314 | // 315 | message Span { 316 | required uint32 begin = 1; 317 | required uint32 end = 2; 318 | } 319 | 320 | // 321 | // A Timex object, representing a temporal expression (TIMe EXpression) 322 | // These fields are not *really* optional. CoreNLP will crash without them. 323 | // 324 | message Timex { 325 | optional string value = 1; 326 | optional string altValue = 2; 327 | optional string text = 3; 328 | optional string type = 4; 329 | optional string tid = 5; 330 | optional uint32 beginPoint = 6; 331 | optional uint32 endPoint = 7; 332 | } 333 | 334 | // 335 | // A representation of an entity in a relation. 
336 | // This corresponds to the EntityMention, and more broadly the 337 | // ExtractionObject classes. 338 | // 339 | message Entity { 340 | optional uint32 headStart = 6; 341 | optional uint32 headEnd = 7; 342 | optional string mentionType = 8; 343 | optional string normalizedName = 9; 344 | optional uint32 headTokenIndex = 10; 345 | optional string corefID = 11; 346 | // inherited from ExtractionObject 347 | optional string objectID = 1; 348 | optional uint32 extentStart = 2; 349 | optional uint32 extentEnd = 3; 350 | optional string type = 4; 351 | optional string subtype = 5; 352 | // Implicit 353 | // uint32 sentence @see implicit in sentence 354 | } 355 | 356 | // 357 | // A representation of a relation, mirroring RelationMention 358 | // 359 | message Relation { 360 | repeated string argName = 6; 361 | repeated Entity arg = 7; 362 | optional string signature = 8; 363 | // inherited from ExtractionObject 364 | optional string objectID = 1; 365 | optional uint32 extentStart = 2; 366 | optional uint32 extentEnd = 3; 367 | optional string type = 4; 368 | optional string subtype = 5; 369 | // Implicit 370 | // uint32 sentence @see implicit in sentence 371 | } 372 | 373 | // 374 | // A Natural Logic operator 375 | // 376 | message Operator { 377 | required string name = 1; 378 | required int32 quantifierSpanBegin = 2; 379 | required int32 quantifierSpanEnd = 3; 380 | required int32 subjectSpanBegin = 4; 381 | required int32 subjectSpanEnd = 5; 382 | required int32 objectSpanBegin = 6; 383 | required int32 objectSpanEnd = 7; 384 | } 385 | 386 | // 387 | // The seven informative Natural Logic relations 388 | // 389 | enum NaturalLogicRelation { 390 | EQUIVALENCE = 0; 391 | FORWARD_ENTAILMENT = 1; 392 | REVERSE_ENTAILMENT = 2; 393 | NEGATION = 3; 394 | ALTERNATION = 4; 395 | COVER = 5; 396 | INDEPENDENCE = 6; 397 | } 398 | 399 | // 400 | // The polarity of a word, according to Natural Logic 401 | // 402 | message Polarity { 403 | required NaturalLogicRelation projectEquivalence = 1; 404 | required NaturalLogicRelation projectForwardEntailment = 2; 405 | required NaturalLogicRelation projectReverseEntailment = 3; 406 | required NaturalLogicRelation projectNegation = 4; 407 | required NaturalLogicRelation projectAlternation = 5; 408 | required NaturalLogicRelation projectCover = 6; 409 | required NaturalLogicRelation projectIndependence = 7; 410 | } 411 | 412 | // 413 | // An NER mention in the text 414 | // 415 | message NERMention { 416 | optional uint32 sentenceIndex = 1; 417 | required uint32 tokenStartInSentenceInclusive = 2; 418 | required uint32 tokenEndInSentenceExclusive = 3; 419 | required string ner = 4; 420 | optional string normalizedNER = 5; 421 | optional string entityType = 6; 422 | optional Timex timex = 7; 423 | optional string wikipediaEntity = 8; 424 | } 425 | 426 | // 427 | // An entailed sentence fragment. 428 | // Created by the openie annotator. 429 | // 430 | message SentenceFragment { 431 | repeated uint32 tokenIndex = 1; 432 | optional uint32 root = 2; 433 | optional bool assumedTruth = 3; 434 | optional double score = 4; 435 | } 436 | 437 | 438 | // 439 | // The index of a token in a document, including the sentence 440 | // index and the offset. 441 | // 442 | message TokenLocation { 443 | optional uint32 sentenceIndex = 1; 444 | optional uint32 tokenIndex = 2; 445 | 446 | } 447 | 448 | 449 | // 450 | // An OpenIE relation triple. 451 | // Created by the openie annotator. 
452 | //
453 | message RelationTriple {
454 |   optional string subject = 1;  // The surface form of the subject
455 |   optional string relation = 2;  // The surface form of the relation (required)
456 |   optional string object = 3;  // The surface form of the object
457 |   optional double confidence = 4;  // The [optional] confidence of the extraction
458 |   repeated TokenLocation subjectTokens = 13;  // The tokens comprising the subject of the triple
459 |   repeated TokenLocation relationTokens = 14;  // The tokens comprising the relation of the triple
460 |   repeated TokenLocation objectTokens = 15;  // The tokens comprising the object of the triple
461 |   optional DependencyGraph tree = 8;  // The dependency graph fragment for this triple
462 |   optional bool istmod = 9;  // If true, this expresses an implicit tmod relation
463 |   optional bool prefixBe = 10;  // If true, this relation string is missing a 'be' prefix
464 |   optional bool suffixBe = 11;  // If true, this relation string is missing a 'be' suffix
465 |   optional bool suffixOf = 12;  // If true, this relation string is missing an 'of' suffix
466 | }
467 | 
468 | 
469 | //
470 | // A map from strings to strings.
471 | // Used, minimally, in the CoNLLU featurizer
472 | //
473 | message MapStringString {
474 |   repeated string key = 1;
475 |   repeated string value = 2;
476 | }
477 | 
478 | //
479 | // A map from integers to strings.
480 | // Used, minimally, in the CoNLLU featurizer
481 | //
482 | message MapIntString {
483 |   repeated uint32 key = 1;
484 |   repeated string value = 2;
485 | }
486 | 
487 | 
--------------------------------------------------------------------------------
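A minimal usage sketch tying the client code above to the schema in doc/CoreNLP.proto. It assumes the package is importable as corenlp, that a CoreNLP server is reachable with the client's default constructor settings (the constructor appears earlier in corenlp/client.py and is not shown here), and that the sample text and the tokensregex pattern are purely illustrative.

# Sketch only: anything marked "illustrative" or "assumed" is not taken from the repository.
import corenlp

# Assumes the default endpoint/annotators configured earlier in client.py.
client = corenlp.CoreNLPClient()

text = "Chris wrote a simple sentence."  # illustrative input

# output_format="serialized" makes annotate() parse the response into a
# corenlp.Document protobuf message via parseFromDelimitedString().
doc = client.annotate(text,
                      annotators=["tokenize", "ssplit", "pos", "lemma", "ner"],
                      output_format="serialized")
for sentence in doc.sentence:      # repeated Sentence (see doc/CoreNLP.proto)
    for token in sentence.token:   # repeated Token
        print(token.word, token.pos, token.ner)

# update() round-trips a serialized Document through the server for further
# annotation; annotate() with output_format="json" returns the decoded JSON,
# while "text", "conll", "conllu", and "xml" return the raw response text.
doc = client.update(doc)
ann = client.annotate(text, annotators=["tokenize", "ssplit"], output_format="json")

# tokensregex()/semgrex() return the server's JSON match structure; with
# to_words=True the matches are flattened by regex_matches_to_indexed_words().
matches = client.tokensregex(text, "/wrote/", to_words=True)  # illustrative pattern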