├── tests ├── data │ └── test.dat ├── __init__.py ├── test_client.py ├── test_protobuf.py ├── test_annotator.py └── happyfuntokenizer.py ├── CHANGELOG ├── MANIFEST.in ├── setup.cfg ├── .travis.yml ├── .github └── ISSUE_TEMPLATE │ └── use-stanza-instead.md ├── corenlp ├── __init__.py ├── main.py ├── annotator.py └── client.py ├── tox.ini ├── LICENSE ├── .gitignore ├── setup.py ├── README.rst └── doc └── CoreNLP.proto /tests/data/test.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/python-stanford-corenlp/HEAD/tests/data/test.dat -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 3.7.1 - Fixed some bugs with tests for stanford-corenlp and renamed to stanford-corenlp 2 | 3.7.0 - Initial release 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include *.rst 3 | include LICENSE 4 | include CHANGELOG 5 | 6 | # Include the data files 7 | recursive-include corenlp *.py 8 | recursive-include doc *.proto 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # the inclusion of the tests module is not meant to offer best practices for 2 | # testing in general, but rather to support the `find_packages` example in 3 | # setup.py that excludes installing the "tests" package 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=1 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.5 5 | notifications: 6 | email: false 7 | before_install: 8 | - sudo apt-get update 9 | install: 10 | - pip install tox-travis 11 | # Run test 12 | script: 13 | - tox 14 | # only integrate the master branch 15 | branches: 16 | only: 17 | - master 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/use-stanza-instead.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Use stanza instead 3 | about: Stop using this repo 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | This repo is no longer supported. Please use stanza instead. 11 | 12 | https://github.com/stanfordnlp/stanza 13 | 14 | If this new issue is not completely ignored, the only response you will get will be to use stanza instead. 
15 | -------------------------------------------------------------------------------- /corenlp/__init__.py: -------------------------------------------------------------------------------- 1 | from corenlp_protobuf import to_text 2 | from corenlp_protobuf import Document, Sentence, Token, IndexedWord, Span 3 | from corenlp_protobuf import ParseTree, DependencyGraph, CorefChain 4 | from corenlp_protobuf import Mention, NERMention, Entity, Relation, RelationTriple, Timex 5 | from corenlp_protobuf import Quote, SpeakerInfo 6 | from corenlp_protobuf import Operator, Polarity 7 | from corenlp_protobuf import SentenceFragment, TokenLocation 8 | from corenlp_protobuf import MapStringString, MapIntString 9 | from .client import CoreNLPClient, AnnotationException, TimeoutException 10 | from .annotator import Annotator 11 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # this file is *not* meant to cover or endorse the use of tox or pytest or 2 | # testing in general, 3 | # 4 | # It's meant to show the use of: 5 | # 6 | # - check-manifest 7 | # confirm items checked into vcs are in your sdist 8 | # - python setup.py check (using the readme_renderer extension) 9 | # confirms your long_description will render correctly on pypi 10 | # 11 | # and also to help confirm pull requests to this project. 12 | 13 | [tox] 14 | envlist = py{27,35,36} 15 | 16 | [testenv] 17 | basepython = 18 | py27: python2.7 19 | py35: python3.5 20 | py36: python3.6 21 | deps = 22 | check-manifest 23 | readme_renderer 24 | pytest 25 | requests 26 | protobuf 27 | commands = 28 | check-manifest --ignore tox.ini,tests* 29 | python setup.py check -m -r -s 30 | py.test tests/test_protobuf.py 31 | py.test tests/test_annotator.py -k "annotator_" 32 | [flake8] 33 | exclude = .tox,*.egg,build,data 34 | select = E,W,F 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Stanford NLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that call a running CoreNLPClient. 
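These tests launch a StanfordCoreNLPServer in the background, so the $CORENLP_HOME environment variable must point at an unzipped CoreNLP release (see the README) before they can run.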
3 | """ 4 | import corenlp 5 | 6 | TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n" 7 | 8 | def test_connect(): 9 | with corenlp.CoreNLPClient() as client: 10 | client.ensure_alive() 11 | assert client.is_active 12 | assert client.is_alive() 13 | 14 | def test_annotate(): 15 | with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client: 16 | ann = client.annotate(TEXT) 17 | assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1] 18 | 19 | def test_update(): 20 | with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client: 21 | ann = client.annotate(TEXT) 22 | ann = client.update(ann) 23 | assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1] 24 | 25 | def test_tokensregex(): 26 | with corenlp.CoreNLPClient(annotators='tokenize ssplit ner depparse'.split(), timeout=60000) as client: 27 | # Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml 28 | pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/' 29 | matches = client.tokensregex(TEXT, pattern) 30 | assert len(matches["sentences"]) == 1 31 | assert matches["sentences"][0]["length"] == 1 32 | assert matches == { 33 | "sentences": [{ 34 | "0": { 35 | "text": "Chris wrote a simple sentence", 36 | "begin": 0, 37 | "end": 5, 38 | "1": { 39 | "text": "Chris", 40 | "begin": 0, 41 | "end": 1 42 | }}, 43 | "length": 1 44 | },]} 45 | 46 | def test_semgrex(): 47 | with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma ner depparse'.split(), timeout=60000) as client: 48 | pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object' 49 | matches = client.semgrex(TEXT, pattern, to_words=True) 50 | assert matches == [ 51 | { 52 | "text": "wrote", 53 | "begin": 1, 54 | "end": 2, 55 | "$subject": { 56 | "text": "Chris", 57 | "begin": 0, 58 | "end": 1 59 | }, 60 | "$object": { 61 | "text": "sentence", 62 | "begin": 4, 63 | "end": 5 64 | }, 65 | "sentence": 0,}] 66 | -------------------------------------------------------------------------------- /corenlp/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Simple shell program to pipe in 5 | """ 6 | 7 | import corenlp 8 | 9 | import json 10 | import re 11 | import csv 12 | import sys 13 | from collections import namedtuple, OrderedDict 14 | 15 | FLOAT_RE = re.compile(r"\d*\.\d+") 16 | INT_RE = re.compile(r"\d+") 17 | 18 | def dictstr(arg): 19 | """ 20 | Parse a key=value string as a tuple (key, value) that can be provided as an argument to dict() 21 | """ 22 | key, value = arg.split("=") 23 | 24 | if value.lower() == "true" or value.lower() == "false": 25 | value = bool(value) 26 | elif INT_RE.match(value): 27 | value = int(value) 28 | elif FLOAT_RE.match(value): 29 | value = float(value) 30 | return (key, value) 31 | 32 | 33 | def do_annotate(args): 34 | args.props = dict(args.props) if args.props else {} 35 | if args.sentence_mode: 36 | args.props["ssplit.isOneSentence"] = True 37 | 38 | with corenlp.CoreNLPClient(annotators=args.annotators, properties=args.props, be_quiet=not args.verbose_server) as client: 39 | for line in args.input: 40 | if line.startswith("#"): continue 41 | 42 | ann = client.annotate(line.strip(), output_format=args.format) 43 | 44 | if args.format == "json": 45 | if args.sentence_mode: 46 | ann = ann["sentences"][0] 47 | 48 | args.output.write(json.dumps(ann)) 49 | args.output.write("\n") 50 | 51 | def main(): 52 | import argparse 53 | parser = 
argparse.ArgumentParser(description='Annotate data') 54 | parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Input file to process; each line contains one document (default: stdin)") 55 | parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="File to write annotations to (default: stdout)") 56 | parser.add_argument('-f', '--format', choices=["json",], default="json", help="Output format") 57 | parser.add_argument('-a', '--annotators', nargs="+", type=str, default=["tokenize ssplit lemma pos"], help="A list of annotators") 58 | parser.add_argument('-s', '--sentence-mode', action="store_true",help="Assume each line of input is a sentence.") 59 | parser.add_argument('-v', '--verbose-server', action="store_true",help="Server is made verbose") 60 | parser.add_argument('-m', '--memory', type=str, default="4G", help="Memory to use for the server") 61 | parser.add_argument('-p', '--props', nargs="+", type=dictstr, help="Properties as a list of key=value pairs") 62 | parser.set_defaults(func=do_annotate) 63 | 64 | ARGS = parser.parse_args() 65 | if ARGS.func is None: 66 | parser.print_help() 67 | sys.exit(1) 68 | else: 69 | ARGS.func(ARGS) 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 2 | 3 | See: 4 | https://packaging.python.org/en/latest/distributing.html 5 | https://github.com/pypa/sampleproject 6 | """ 7 | 8 | # Always prefer setuptools over distutils 9 | from setuptools import setup, find_packages 10 | # To use a consistent encoding 11 | from codecs import open 12 | from os import path 13 | 14 | here = path.abspath(path.dirname(__file__)) 15 | 16 | # Get the long description from the README file 17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | setup( 21 | name='stanford-corenlp', 22 | 23 | # Versions should comply with PEP440. For a discussion on single-sourcing 24 | # the version across setup.py and the project code, see 25 | # https://packaging.python.org/en/latest/single_source_version.html 26 | version='3.9.2', 27 | 28 | description='Official python interface for Stanford CoreNLP', 29 | long_description=long_description, 30 | 31 | # The project's main homepage. 32 | url='https://github.com/stanfordnlp/python-stanford-corenlp', 33 | 34 | # Author details 35 | author='Stanford NLP Group', 36 | author_email='chaganty@cs.stanford.edu', 37 | 38 | # Choose your license 39 | license='MIT', 40 | 41 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 42 | classifiers=[ 43 | # How mature is this project? Common values are 44 | # 3 - Alpha 45 | # 4 - Beta 46 | # 5 - Production/Stable 47 | 'Development Status :: 4 - Beta', 48 | 49 | # Indicate who your project is intended for 50 | 'Intended Audience :: Developers', 51 | 'Topic :: Software Development :: Object Brokering', 52 | 53 | # Pick your license as you wish (should match "license" above) 54 | 'License :: OSI Approved :: MIT License', 55 | 56 | # Specify the Python versions you support here. In particular, ensure 57 | # that you indicate whether you support Python 2, Python 3 or both. 
58 | 'Programming Language :: Python :: 2', 59 | 'Programming Language :: Python :: 2.7', 60 | 'Programming Language :: Python :: 3', 61 | 'Programming Language :: Python :: 3.3', 62 | 'Programming Language :: Python :: 3.4', 63 | 'Programming Language :: Python :: 3.5', 64 | ], 65 | 66 | # What does your project relate to? 67 | keywords='corenlp natural-language-processing nlp', 68 | 69 | # You can just specify the packages manually here if your project is 70 | # simple. Or you can use find_packages(). 71 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 72 | 73 | # Alternatively, if you want to distribute just a my_module.py, uncomment 74 | # this: 75 | #py_modules=["corenlp_protobuf"], 76 | 77 | # List run-time dependencies here. These will be installed by pip when 78 | # your project is installed. For an analysis of "install_requires" vs pip's 79 | # requirements files see: 80 | # https://packaging.python.org/en/latest/requirements.html 81 | install_requires=['corenlp-protobuf >= 3.8.0', 'requests >= 2.10.0', 'six >= 1.9'], 82 | 83 | # List additional groups of dependencies here (e.g. development 84 | # dependencies). You can install these using the following syntax, 85 | # for example: 86 | # $ pip install -e .[dev,test] 87 | extras_require={ 88 | 'dev': ['check-manifest'], 89 | 'test': ['coverage'], 90 | }, 91 | 92 | # If there are data files included in your packages that need to be 93 | # installed, specify them here. If using Python 2.6 or less, then these 94 | # have to be included in MANIFEST.in as well. 95 | package_data={ 96 | }, 97 | 98 | # Although 'package_data' is the preferred approach, in some case you may 99 | # need to place data files outside of your packages. See: 100 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 101 | # In this case, 'data_file' will be installed into '/my_data' 102 | data_files=[], 103 | 104 | # To provide executable scripts, use entry points in preference to the 105 | # "scripts" keyword. Entry points provide cross-platform support and allow 106 | # pip to create the appropriate form of executable for the target platform. 107 | entry_points={ 108 | 'console_scripts': [ 109 | 'annotate = corenlp.main:main', 110 | ], 111 | }, 112 | ) 113 | -------------------------------------------------------------------------------- /tests/test_protobuf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests to read a stored protobuf. 3 | Also serves as an example of how to parse sentences, tokens, pos, lemma, 4 | ner, dependencies and mentions. 5 | 6 | The test corresponds to annotations for the following sentence: 7 | Chris wrote a simple sentence that he parsed with Stanford CoreNLP. 
8 | """ 9 | 10 | import os 11 | from pytest import fixture 12 | from corenlp_protobuf import Document, Sentence, Token, DependencyGraph,\ 13 | CorefChain 14 | from corenlp_protobuf import parseFromDelimitedString, writeToDelimitedString, to_text 15 | 16 | 17 | # Thext that was annotated 18 | TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n" 19 | 20 | 21 | @fixture 22 | def doc_pb(): 23 | test_dir = os.path.dirname(os.path.abspath(__file__)) 24 | test_data = os.path.join(test_dir, 'data', 'test.dat') 25 | with open(test_data, 'rb') as f: 26 | buf = f.read() 27 | doc = Document() 28 | parseFromDelimitedString(doc, buf) 29 | return doc 30 | 31 | def test_parse_protobuf(doc_pb): 32 | assert doc_pb.ByteSize() == 4239 33 | 34 | def test_write_protobuf(doc_pb): 35 | stream = writeToDelimitedString(doc_pb) 36 | buf = stream.getvalue() 37 | stream.close() 38 | 39 | doc_pb_ = Document() 40 | parseFromDelimitedString(doc_pb_, buf) 41 | assert doc_pb == doc_pb_ 42 | 43 | def test_document_text(doc_pb): 44 | assert doc_pb.text == TEXT 45 | 46 | 47 | def test_sentences(doc_pb): 48 | assert len(doc_pb.sentence) == 1 49 | 50 | sentence = doc_pb.sentence[0] 51 | assert isinstance(sentence, Sentence) 52 | # check sentence length 53 | assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67 54 | # Note that the sentence text should actually be recovered from the tokens. 55 | assert sentence.text == '' 56 | assert to_text(sentence) == TEXT[:-1] 57 | 58 | 59 | def test_tokens(doc_pb): 60 | sentence = doc_pb.sentence[0] 61 | tokens = sentence.token 62 | assert len(tokens) == 12 63 | assert isinstance(tokens[0], Token) 64 | 65 | # Word 66 | words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split() 67 | words_ = [t.word for t in tokens] 68 | assert words_ == words 69 | 70 | # Lemma 71 | lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split() 72 | lemmas_ = [t.lemma for t in tokens] 73 | assert lemmas_ == lemmas 74 | 75 | # POS 76 | pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split() 77 | pos_ = [t.pos for t in tokens] 78 | assert pos_ == pos 79 | 80 | # NER 81 | ner = "PERSON O O O O O O O O ORGANIZATION O O".split() 82 | ner_ = [t.ner for t in tokens] 83 | assert ner_ == ner 84 | 85 | # character offsets 86 | begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()] 87 | end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()] 88 | begin_ = [t.beginChar for t in tokens] 89 | end_ = [t.endChar for t in tokens] 90 | assert begin_ == begin 91 | assert end_ == end 92 | 93 | 94 | def test_dependency_parse(doc_pb): 95 | """ 96 | Extract the dependency parse from the annotation. 97 | """ 98 | sentence = doc_pb.sentence[0] 99 | 100 | # You can choose from the following types of dependencies. 101 | # In general, you'll want enhancedPlusPlus 102 | assert sentence.basicDependencies.ByteSize() > 0 103 | assert sentence.enhancedDependencies.ByteSize() > 0 104 | assert sentence.enhancedPlusPlusDependencies.ByteSize() > 0 105 | 106 | tree = sentence.enhancedPlusPlusDependencies 107 | isinstance(tree, DependencyGraph) 108 | # Indices are 1-indexd with 0 being the "pseudo root" 109 | assert tree.root # 'wrote' is the root. == [2] 110 | # There are as many nodes as there are tokens. 
111 | assert len(tree.node) == len(sentence.token) 112 | 113 | # Enhanced++ dependencies often contain additional edges and are 114 | # not trees -- here, 'parsed' would also have an edge to 115 | # 'sentence' 116 | assert len(tree.edge) == 12 117 | 118 | # This edge goes from "wrote" to "Chris" 119 | edge = tree.edge[0] 120 | assert edge.source == 2 121 | assert edge.target == 1 122 | assert edge.dep == "nsubj" 123 | 124 | 125 | def test_coref_chain(doc_pb): 126 | """ 127 | Extract the coreference chains from the annotation. 128 | """ 129 | # Coreference chains span sentences and are stored in the 130 | # document. 131 | chains = doc_pb.corefChain 132 | 133 | # In this document there is 1 chain with Chris and he. 134 | assert len(chains) == 1 135 | chain = chains[0] 136 | assert isinstance(chain, CorefChain) 137 | assert chain.mention[0].beginIndex == 0 # 'Chris' 138 | assert chain.mention[0].endIndex == 1 139 | assert chain.mention[0].gender == "MALE" 140 | 141 | assert chain.mention[1].beginIndex == 6 # 'he' 142 | assert chain.mention[1].endIndex == 7 143 | assert chain.mention[1].gender == "MALE" 144 | 145 | assert chain.representative == 0 # Head of the chain is 'Chris' 146 | -------------------------------------------------------------------------------- /corenlp/annotator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines a base class that can be used to annotate. 3 | """ 4 | import io 5 | from multiprocessing import Process 6 | from six.moves.BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer 7 | from six.moves import http_client as HTTPStatus 8 | 9 | from corenlp_protobuf import Document, parseFromDelimitedString, writeToDelimitedString 10 | 11 | class Annotator(Process): 12 | """ 13 | This annotator base class hosts a lightweight server that accepts 14 | annotation requests from CoreNLP. 15 | Each annotator simply defines 3 functions: requires, provides and annotate. 16 | 17 | This class takes care of defining appropriate endpoints to interface 18 | with CoreNLP. 19 | """ 20 | @property 21 | def name(self): 22 | """ 23 | Name of the annotator (used by CoreNLP) 24 | """ 25 | raise NotImplementedError() 26 | 27 | @property 28 | def requires(self): 29 | """ 30 | Requires has to specify all the annotations required before we 31 | are called. 32 | """ 33 | raise NotImplementedError() 34 | 35 | @property 36 | def provides(self): 37 | """ 38 | The set of annotations guaranteed to be provided when we are done. 39 | NOTE: that these annotations are either fully qualified Java 40 | class names or refer to nested classes of 41 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 42 | """ 43 | raise NotImplementedError() 44 | 45 | def annotate(self, ann): 46 | """ 47 | @ann: is a protobuf annotation object. 48 | Actually populate @ann with tokens. 49 | """ 50 | raise NotImplementedError() 51 | 52 | @property 53 | def properties(self): 54 | """ 55 | Defines the Java properties used to register this annotator with CoreNLP. 
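For example, an annotator named "happyfun" served on the default port 8432 maps customAnnotatorClass.happyfun to CoreNLP's GenericWebServiceAnnotator and points generic.endpoint at this annotator's host and port, alongside comma-joined generic.requires and generic.provides lists, as built below.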
56 | """ 57 | return { 58 | "customAnnotatorClass.{}".format(self.name): "edu.stanford.nlp.pipeline.GenericWebServiceAnnotator", 59 | "generic.endpoint": "http://{}:{}".format(self.host, self.port), 60 | "generic.requires": ",".join(self.requires), 61 | "generic.provides": ",".join(self.provides), 62 | } 63 | 64 | class _Handler(BaseHTTPRequestHandler): 65 | annotator = None 66 | 67 | def __init__(self, request, client_address, server): 68 | BaseHTTPRequestHandler.__init__(self, request, client_address, server) 69 | 70 | def do_GET(self): 71 | """ 72 | Handle a ping request 73 | """ 74 | if not self.path.endswith("/"): self.path += "/" 75 | if self.path == "/ping/": 76 | msg = "pong".encode("UTF-8") 77 | 78 | self.send_response(HTTPStatus.OK) 79 | self.send_header("Content-Type", "text/application") 80 | self.send_header("Content-Length", len(msg)) 81 | self.end_headers() 82 | self.wfile.write(msg) 83 | else: 84 | self.send_response(HTTPStatus.BAD_REQUEST) 85 | self.end_headers() 86 | 87 | def do_POST(self): 88 | """ 89 | Handle an annotate request 90 | """ 91 | if not self.path.endswith("/"): self.path += "/" 92 | if self.path == "/annotate/": 93 | # Read message 94 | length = int(self.headers.get('content-length')) 95 | msg = self.rfile.read(length) 96 | 97 | # Do the annotation 98 | doc = Document() 99 | parseFromDelimitedString(doc, msg) 100 | self.annotator.annotate(doc) 101 | 102 | with io.BytesIO() as stream: 103 | writeToDelimitedString(doc, stream) 104 | msg = stream.getvalue() 105 | 106 | # write message 107 | self.send_response(HTTPStatus.OK) 108 | self.send_header("Content-Type", "application/x-protobuf") 109 | self.send_header("Content-Length", len(msg)) 110 | self.end_headers() 111 | self.wfile.write(msg) 112 | 113 | else: 114 | self.send_response(HTTPStatus.BAD_REQUEST) 115 | self.end_headers() 116 | 117 | def __init__(self, host="", port=8432): 118 | """ 119 | Launches a server endpoint to communicate with CoreNLP 120 | """ 121 | Process.__init__(self) 122 | self.host, self.port = host, port 123 | self._Handler.annotator = self 124 | 125 | def run(self): 126 | """ 127 | Runs the server using Python's simple HTTPServer. 128 | TODO: make this multithreaded. 129 | """ 130 | httpd = HTTPServer((self.host, self.port), self._Handler) 131 | sa = httpd.socket.getsockname() 132 | serve_message = "Serving HTTP on {host} port {port} (http://{host}:{port}/) ..." 133 | print(serve_message.format(host=sa[0], port=sa[1])) 134 | try: 135 | httpd.serve_forever() 136 | except KeyboardInterrupt: 137 | print("\nKeyboard interrupt received, exiting.") 138 | httpd.shutdown() 139 | -------------------------------------------------------------------------------- /tests/test_annotator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | A test annotator (tokens). 4 | """ 5 | import six 6 | 7 | import pytest 8 | import time 9 | import requests 10 | import corenlp 11 | from .happyfuntokenizer import Tokenizer 12 | 13 | class HappyFunTokenizer(Tokenizer, corenlp.Annotator): 14 | def __init__(self, preserve_case=False): 15 | Tokenizer.__init__(self, preserve_case) 16 | corenlp.Annotator.__init__(self) 17 | 18 | @property 19 | def name(self): 20 | """ 21 | Name of the annotator (used by CoreNLP) 22 | """ 23 | return "happyfun" 24 | 25 | @property 26 | def requires(self): 27 | """ 28 | Requires has to specify all the annotations required before we 29 | are called. 
30 | """ 31 | return [] 32 | 33 | @property 34 | def provides(self): 35 | """ 36 | The set of annotations guaranteed to be provided when we are done. 37 | NOTE: that these annotations are either fully qualified Java 38 | class names or refer to nested classes of 39 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 40 | """ 41 | return ["TextAnnotation", 42 | "TokensAnnotation", 43 | "TokenBeginAnnotation", 44 | "TokenEndAnnotation", 45 | "CharacterOffsetBeginAnnotation", 46 | "CharacterOffsetEndAnnotation", 47 | ] 48 | 49 | def annotate(self, ann): 50 | """ 51 | @ann: is a protobuf annotation object. 52 | Actually populate @ann with tokens. 53 | """ 54 | buf, beg_idx, end_idx = ann.text.lower(), 0, 0 55 | for i, word in enumerate(self.tokenize(ann.text)): 56 | token = ann.sentencelessToken.add() 57 | # These are the bare minimum required for the TokenAnnotation 58 | token.word = six.u(word) 59 | token.tokenBeginIndex = i 60 | token.tokenEndIndex = i+1 61 | 62 | # Seek into the txt until you can find this word. 63 | try: 64 | # Try to update beginning index 65 | beg_idx = buf.index(word, beg_idx) 66 | except ValueError: 67 | # Give up -- this will be something random 68 | end_idx = beg_idx + len(word) 69 | 70 | token.beginChar = beg_idx 71 | token.endChar = end_idx 72 | 73 | beg_idx, end_idx = end_idx, end_idx 74 | 75 | def test_annotator_annotate(): 76 | cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 77 | u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()), 78 | (u"HTML entities & other Web oddities can be an ácute pain >:(", 79 | u"html entities and other web oddities can be an ácute".split() + [u"", u"pain", u"", u">:("]), 80 | (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", 81 | u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split()) 82 | ] 83 | 84 | annotator = HappyFunTokenizer() 85 | 86 | for text, tokens in cases: 87 | ann = corenlp.Document() 88 | ann.text = text 89 | annotator.annotate(ann) 90 | tokens_ = [t.word for t in ann.sentencelessToken] 91 | assert tokens_ == tokens 92 | 93 | def test_annotator_alive(): 94 | annotator = HappyFunTokenizer() 95 | annotator.start() 96 | 97 | try: 98 | time.sleep(2) 99 | # Ping the annotator. 100 | r = requests.get("http://localhost:8432/ping") 101 | assert r.ok 102 | assert r.content.decode("utf-8") == "pong" 103 | r = requests.get("http://localhost:8432/ping/") 104 | assert r.ok 105 | assert r.content.decode("utf-8") == "pong" 106 | finally: 107 | annotator.terminate() 108 | annotator.join() 109 | 110 | # Ignore this test because the CustomAnnotator interface isn't a part of 111 | # StanfordCoreNLP yet. 
112 | @pytest.mark.skip(reason="Ignore this test because the CustomAnnotator interface isn't a part of Stanford CoreNLP yet.") 113 | def test_tokenizer(): 114 | cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 115 | u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()), 116 | (u"HTML entities & other Web oddities can be an ácute pain >:(", 117 | u"html entities and other web oddities can be an ácute".split() + [u"", u"pain", u"", u">:("]), 118 | (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", 119 | u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split()) 120 | ] 121 | 122 | annotator = HappyFunTokenizer() 123 | annotator.start() 124 | 125 | try: 126 | with corenlp.CoreNLPClient(properties=annotator.properties, annotators="happyfun ssplit pos".split()) as client: 127 | for text, tokens in cases: 128 | ann = client.annotate(text) 129 | tokens_ = [t.word for t in ann.sentence[0].token] 130 | assert tokens == tokens_ 131 | finally: 132 | annotator.terminate() 133 | annotator.join() 134 | -------------------------------------------------------------------------------- /tests/happyfuntokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This code implements a basic, Twitter-aware tokenizer. 6 | 7 | A tokenizer is a function that splits a string of text into words. In 8 | Python terms, we map string and unicode objects into lists of unicode 9 | objects. 10 | 11 | There is not a single right way to do tokenizing. The best method 12 | depends on the application. This tokenizer is designed to be flexible 13 | and this easy to adapt to new domains and tasks. The basic logic is 14 | this: 15 | 16 | 1. The tuple regex_strings defines a list of regular expression 17 | strings. 18 | 19 | 2. The regex_strings strings are put, in order, into a compiled 20 | regular expression object called word_re. 21 | 22 | 3. The tokenization is done by word_re.findall(s), where s is the 23 | user-supplied string, inside the tokenize() method of the class 24 | Tokenizer. 25 | 26 | 4. When instantiating Tokenizer objects, there is a single option: 27 | preserve_case. By default, it is set to True. If it is set to 28 | False, then the tokenizer will downcase everything except for 29 | emoticons. 30 | 31 | The __main__ method illustrates by tokenizing a few examples. 32 | 33 | I've also included a Tokenizer method tokenize_random_tweet(). If the 34 | twitter library is installed (http://code.google.com/p/python-twitter/) 35 | and Twitter is cooperating, then it should tokenize a random 36 | English-language tweet. 
37 | """ 38 | 39 | __author__ = "Christopher Potts" 40 | __copyright__ = "Copyright 2011, Christopher Potts" 41 | __credits__ = [] 42 | __license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/" 43 | __version__ = "1.0" 44 | __maintainer__ = "Christopher Potts" 45 | __email__ = "See the author's website" 46 | 47 | ###################################################################### 48 | 49 | import re 50 | from six.moves import html_entities 51 | 52 | ###################################################################### 53 | # The following strings are components in the regular expression 54 | # that is used for tokenizing. It's important that phone_number 55 | # appears first in the final regex (since it can contain whitespace). 56 | # It also could matter that tags comes after emoticons, due to the 57 | # possibility of having text like 58 | # 59 | # <:| and some text >:) 60 | # 61 | # Most imporatantly, the final element should always be last, since it 62 | # does a last ditch whitespace-based tokenization of whatever is left. 63 | 64 | # This particular element is used in a couple ways, so we define it 65 | # with a name: 66 | emoticon_string = r""" 67 | (?: 68 | [<>]? 69 | [:;=8] # eyes 70 | [\-o\*\']? # optional nose 71 | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth 72 | | 73 | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth 74 | [\-o\*\']? # optional nose 75 | [:;=8] # eyes 76 | [<>]? 77 | )""" 78 | 79 | # The components of the tokenizer: 80 | regex_strings = ( 81 | # Phone numbers: 82 | r""" 83 | (?: 84 | (?: # (international) 85 | \+?[01] 86 | [\-\s.]* 87 | )? 88 | (?: # (area code) 89 | [\(]? 90 | \d{3} 91 | [\-\s.\)]* 92 | )? 93 | \d{3} # exchange 94 | [\-\s.]* 95 | \d{4} # base 96 | )""" 97 | , 98 | # Emoticons: 99 | emoticon_string 100 | , 101 | # HTML tags: 102 | r"""<[^>]+>""" 103 | , 104 | # Twitter username: 105 | r"""(?:@[\w_]+)""" 106 | , 107 | # Twitter hashtags: 108 | r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" 109 | , 110 | # Remaining word types: 111 | r""" 112 | (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. 113 | | 114 | (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. 115 | | 116 | (?:[\w_]+) # Words without apostrophes or dashes. 117 | | 118 | (?:\.(?:\s*\.){1,}) # Ellipsis dots. 119 | | 120 | (?:\S) # Everything else that isn't whitespace. 
121 | """ 122 | ) 123 | 124 | ###################################################################### 125 | # This is the core tokenizing regex: 126 | 127 | word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE) 128 | 129 | # The emoticon string gets its own regex so that we can preserve case for them as needed: 130 | emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE) 131 | 132 | # These are for regularizing HTML entities to Unicode: 133 | html_entity_digit_re = re.compile(r"&#\d+;") 134 | html_entity_alpha_re = re.compile(r"&\w+;") 135 | amp = "&" 136 | 137 | ###################################################################### 138 | 139 | class Tokenizer: 140 | def __init__(self, preserve_case=False): 141 | self.preserve_case = preserve_case 142 | 143 | def tokenize(self, s): 144 | """ 145 | Argument: s -- any string or unicode object 146 | Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False 147 | """ 148 | # Try to ensure unicode: 149 | try: 150 | s = str(s) 151 | except UnicodeDecodeError: 152 | s = str(s).encode('string_escape') 153 | s = str(s) 154 | # Fix HTML character entitites: 155 | s = self.__html2unicode(s) 156 | # Tokenize: 157 | words = word_re.findall(s) 158 | # Possible alter the case, but avoid changing emoticons like :D into :d: 159 | if not self.preserve_case: 160 | words = list(map((lambda x : x if emoticon_re.search(x) else x.lower()), words)) 161 | return words 162 | 163 | def tokenize_random_tweet(self): 164 | """ 165 | If the twitter library is installed and a twitter connection 166 | can be established, then tokenize a random tweet. 167 | """ 168 | try: 169 | import twitter 170 | except ImportError: 171 | print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/") 172 | from random import shuffle 173 | api = twitter.Api() 174 | tweets = api.GetPublicTimeline() 175 | if tweets: 176 | for tweet in tweets: 177 | if tweet.user.lang == 'en': 178 | return self.tokenize(tweet.text) 179 | else: 180 | raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again") 181 | 182 | def __html2unicode(self, s): 183 | """ 184 | Internal metod that seeks to replace all the HTML entities in 185 | s with their corresponding unicode characters. 186 | """ 187 | # First the digits: 188 | ents = set(html_entity_digit_re.findall(s)) 189 | if len(ents) > 0: 190 | for ent in ents: 191 | entnum = ent[2:-1] 192 | try: 193 | entnum = int(entnum) 194 | s = s.replace(ent, chr(entnum)) 195 | except: 196 | pass 197 | # Now the alpha versions: 198 | ents = set(html_entity_alpha_re.findall(s)) 199 | ents = list(filter((lambda x : x != amp), ents)) 200 | for ent in ents: 201 | entname = ent[1:-1] 202 | try: 203 | s = s.replace(ent, chr(html_entities.name2codepoint[entname])) 204 | except: 205 | pass 206 | s = s.replace(amp, " and ") 207 | return s 208 | 209 | ############################################################################### 210 | 211 | if __name__ == '__main__': 212 | tok = Tokenizer(preserve_case=False) 213 | samples = ( 214 | "RT @ #happyfuncoding: this is a typical Twitter tweet :-)", 215 | "HTML entities & other Web oddities can be an ácute pain >:(", 216 | "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace." 
217 | ) 218 | 219 | for s in samples: 220 | print("======================================================================") 221 | print(s) 222 | tokenized = tok.tokenize(s) 223 | print("\n".join(tokenized)) 224 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Stanford CoreNLP Python Interface 2 | ================================= 3 | 4 | **NOTE:** This package is now deprecated. Please use the `stanza <https://github.com/stanfordnlp/stanza>`_ package instead. 5 | 6 | 7 | .. image:: https://travis-ci.org/stanfordnlp/python-stanford-corenlp.svg?branch=master 8 | :target: https://travis-ci.org/stanfordnlp/python-stanford-corenlp 9 | 10 | This package contains a python interface for `Stanford CoreNLP 11 | `_, including a reference 12 | implementation for interfacing with the `Stanford CoreNLP server 13 | `_. 14 | The package also contains a base class to expose a python-based annotation 15 | provider (e.g. your favorite neural NER system) to the CoreNLP 16 | pipeline via a lightweight service. 17 | 18 | To use the package, first download the `official java CoreNLP release 19 | `_, unzip it, and define an environment 20 | variable :code:`$CORENLP_HOME` that points to the unzipped directory. 21 | 22 | You can also install this package from `PyPI `_ using :code:`pip install stanford-corenlp` 23 | 24 | ---- 25 | 26 | Command Line Usage 27 | ------------------ 28 | Probably the easiest way to use this package is through the `annotate` command-line utility:: 29 | 30 | usage: annotate [-h] [-i INPUT] [-o OUTPUT] [-f {json}] 31 | [-a ANNOTATORS [ANNOTATORS ...]] [-s] [-v] [-m MEMORY] 32 | [-p PROPS [PROPS ...]] 33 | 34 | Annotate data 35 | 36 | optional arguments: 37 | -h, --help show this help message and exit 38 | -i INPUT, --input INPUT 39 | Input file to process; each line contains one document 40 | (default: stdin) 41 | -o OUTPUT, --output OUTPUT 42 | File to write annotations to (default: stdout) 43 | -f {json}, --format {json} 44 | Output format 45 | -a ANNOTATORS [ANNOTATORS ...], --annotators ANNOTATORS [ANNOTATORS ...] 46 | A list of annotators 47 | -s, --sentence-mode Assume each line of input is a sentence. 48 | -v, --verbose-server Server is made verbose 49 | -m MEMORY, --memory MEMORY 50 | Memory to use for the server 51 | -p PROPS [PROPS ...], --props PROPS [PROPS ...] 52 | Properties as a list of key=value pairs 53 | 54 | 55 | We recommend using `annotate` in conjunction with the wonderful `jq` 56 | command to process the output. As an example, given a file with a 57 | sentence on each line, the following command produces the equivalent 58 | space-separated tokens:: 59 | 60 | cat file.txt | annotate -s -a tokenize | jq '[.tokens[].originalText]' > tokenized.txt 61 | 62 | 63 | Annotation Server Usage 64 | ----------------------- 65 | 66 | .. code-block:: python 67 | 68 | import corenlp 69 | 70 | text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP." 71 | 72 | # We assume that you've downloaded Stanford CoreNLP and defined an environment 73 | # variable $CORENLP_HOME that points to the unzipped directory. 74 | # The code below will launch StanfordCoreNLPServer in the background 75 | # and communicate with the server to annotate the sentence. 76 | with corenlp.CoreNLPClient(annotators="tokenize ssplit pos lemma ner depparse".split()) as client: 77 | ann = client.annotate(text) 78 | 79 | # You can access annotations using ann. 
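# With the default "serialized" output format, ann is a Document protobuf (see doc/CoreNLP.proto), so sentences, tokens and coref chains are exposed as repeated fields such as ann.sentence and ann.corefChain.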
80 | sentence = ann.sentence[0] 81 | 82 | # The corenlp.to_text function is a helper function that 83 | # reconstructs a sentence from tokens. 84 | assert corenlp.to_text(sentence) == text 85 | 86 | # You can access any property within a sentence. 87 | print(sentence.text) 88 | 89 | # Likewise for tokens 90 | token = sentence.token[0] 91 | print(token.lemma) 92 | 93 | # Use tokensregex patterns to find who wrote a sentence. 94 | pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/' 95 | matches = client.tokensregex(text, pattern) 96 | # sentences contains a list with matches for each sentence. 97 | assert len(matches["sentences"]) == 1 98 | # length tells you whether or not there are any matches in this sentence. 99 | assert matches["sentences"][0]["length"] == 1 100 | # You can access matches like most regex groups. 101 | matches["sentences"][0]["0"]["text"] == "Chris wrote a simple sentence" 102 | matches["sentences"][0]["0"]["1"]["text"] == "Chris" 103 | 104 | # Use semgrex patterns to directly find who wrote what. 105 | pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object' 106 | matches = client.semgrex(text, pattern) 107 | # sentences contains a list with matches for each sentence. 108 | assert len(matches["sentences"]) == 1 109 | # length tells you whether or not there are any matches in this sentence. 110 | assert matches["sentences"][0]["length"] == 1 111 | # You can access matches like most regex groups. 112 | matches["sentences"][0]["0"]["text"] == "wrote" 113 | matches["sentences"][0]["0"]["$subject"]["text"] == "Chris" 114 | matches["sentences"][0]["0"]["$object"]["text"] == "sentence" 115 | 116 | See `test_client.py` and `test_protobuf.py` for more examples. Props to 117 | @dan-zheng for tokensregex/semgrex support. 118 | 119 | 120 | Annotation Service Usage 121 | ------------------------ 122 | 123 | *NOTE*: The annotation service allows users to provide a custom 124 | annotator to be used by the CoreNLP pipeline. Unfortunately, it relies 125 | on experimental code internal to the Stanford CoreNLP project that is not yet 126 | available for public use. 127 | 128 | .. code-block:: python 129 | 130 | import corenlp 131 | from .happyfuntokenizer import Tokenizer 132 | 133 | class HappyFunTokenizer(Tokenizer, corenlp.Annotator): 134 | def __init__(self, preserve_case=False): 135 | Tokenizer.__init__(self, preserve_case) 136 | corenlp.Annotator.__init__(self) 137 | 138 | @property 139 | def name(self): 140 | """ 141 | Name of the annotator (used by CoreNLP) 142 | """ 143 | return "happyfun" 144 | 145 | @property 146 | def requires(self): 147 | """ 148 | Requires has to specify all the annotations required before we 149 | are called. 150 | """ 151 | return [] 152 | 153 | @property 154 | def provides(self): 155 | """ 156 | The set of annotations guaranteed to be provided when we are done. 157 | NOTE: that these annotations are either fully qualified Java 158 | class names or refer to nested classes of 159 | edu.stanford.nlp.ling.CoreAnnotations (as is the case below). 160 | """ 161 | return ["TextAnnotation", 162 | "TokensAnnotation", 163 | "TokenBeginAnnotation", 164 | "TokenEndAnnotation", 165 | "CharacterOffsetBeginAnnotation", 166 | "CharacterOffsetEndAnnotation", 167 | ] 168 | 169 | def annotate(self, ann): 170 | """ 171 | @ann: is a protobuf annotation object. 172 | Actually populate @ann with tokens. 
173 | """ 174 | buf, beg_idx, end_idx = ann.text.lower(), 0, 0 175 | for i, word in enumerate(self.tokenize(ann.text)): 176 | token = ann.sentencelessToken.add() 177 | # These are the bare minimum required for the TokenAnnotation 178 | token.word = word 179 | token.tokenBeginIndex = i 180 | token.tokenEndIndex = i+1 181 | 182 | # Seek into the txt until you can find this word. 183 | try: 184 | # Try to update beginning index 185 | beg_idx = buf.index(word, beg_idx) 186 | except ValueError: 187 | # Give up -- this will be something random 188 | end_idx = beg_idx + len(word) 189 | 190 | token.beginChar = beg_idx 191 | token.endChar = end_idx 192 | 193 | beg_idx, end_idx = end_idx, end_idx 194 | 195 | annotator = HappyFunTokenizer() 196 | # Calling .start() will launch the annotator as a service running on 197 | # port 8432 by default. 198 | annotator.start() 199 | 200 | # annotator.properties contains all the right properties for 201 | # Stanford CoreNLP to use this annotator. 202 | with corenlp.CoreNLPClient(properties=annotator.properties, annotators="happyfun ssplit pos".split()) as client: 203 | ann = client.annotate("RT @ #happyfuncoding: this is a typical Twitter tweet :-)") 204 | 205 | tokens = [t.word for t in ann.sentence[0].token] 206 | print(tokens) 207 | 208 | 209 | See `test_annotator.py` for more examples. 210 | -------------------------------------------------------------------------------- /corenlp/client.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Python CoreNLP: a server based interface to Java CoreNLP. 3 | """ 4 | import io 5 | import os 6 | import logging 7 | import json 8 | import shlex 9 | import subprocess 10 | import time 11 | import sys 12 | import datetime 13 | 14 | from six.moves.urllib.parse import urlparse 15 | 16 | import requests 17 | 18 | from corenlp_protobuf import Document, parseFromDelimitedString, writeToDelimitedString, to_text 19 | __author__ = 'arunchaganty, kelvinguu, vzhong, wmonroe4' 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | class AnnotationException(Exception): 24 | """ 25 | Exception raised when there was an error communicating with the CoreNLP server. 26 | """ 27 | pass 28 | 29 | class TimeoutException(AnnotationException): 30 | """ 31 | Exception raised when the CoreNLP server timed out. 32 | """ 33 | pass 34 | 35 | class ShouldRetryException(Exception): 36 | """ 37 | Exception raised if the service should retry the request. 38 | """ 39 | pass 40 | 41 | class PermanentlyFailedException(Exception): 42 | """ 43 | Exception raised if the service should retry the request. 44 | """ 45 | pass 46 | 47 | class RobustService(object): 48 | """ 49 | Service that resuscitates itself if it is not available. 50 | """ 51 | TIMEOUT = 15 52 | 53 | def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout, 54 | stderr=sys.stderr, be_quiet=False): 55 | self.start_cmd = start_cmd and shlex.split(start_cmd) 56 | self.stop_cmd = stop_cmd and shlex.split(stop_cmd) 57 | self.endpoint = endpoint 58 | self.stdout = stdout 59 | self.stderr = stderr 60 | 61 | self.server = None 62 | self.is_active = False 63 | self.be_quiet = be_quiet 64 | 65 | def is_alive(self): 66 | try: 67 | return requests.get(self.endpoint + "/ping").ok 68 | except requests.exceptions.ConnectionError as e: 69 | raise ShouldRetryException(e) 70 | 71 | def start(self): 72 | if self.start_cmd: 73 | if self.be_quiet: 74 | # Issue #26: subprocess.DEVNULL isn't supported in python 2.7. 
75 | stderr = open(os.devnull, 'w') 76 | else: 77 | stderr = self.stderr 78 | self.server = subprocess.Popen(self.start_cmd, 79 | stderr=stderr, 80 | stdout=stderr) 81 | 82 | def stop(self): 83 | if self.server: 84 | self.server.kill() 85 | if self.stop_cmd: 86 | subprocess.run(self.stop_cmd, check=True) 87 | self.is_active = False 88 | 89 | def __enter__(self): 90 | self.start() 91 | return self 92 | 93 | def __exit__(self, _, __, ___): 94 | self.stop() 95 | 96 | def ensure_alive(self): 97 | # Check if the service is active and alive 98 | if self.is_active: 99 | try: 100 | return self.is_alive() 101 | except ShouldRetryException: 102 | pass 103 | 104 | # If not, try to start up the service. 105 | if self.server is None: 106 | self.start() 107 | 108 | # Wait for the service to start up. 109 | start_time = time.time() 110 | while True: 111 | try: 112 | if self.is_alive(): 113 | break 114 | except ShouldRetryException: 115 | pass 116 | 117 | if time.time() - start_time < self.TIMEOUT: 118 | time.sleep(1) 119 | else: 120 | raise PermanentlyFailedException("Timed out waiting for service to come alive.") 121 | 122 | # At this point we are guaranteed that the service is alive. 123 | self.is_active = True 124 | 125 | class CoreNLPClient(RobustService): 126 | """ 127 | A CoreNLP client to the Stanford CoreNLP server. 128 | """ 129 | DEFAULT_ANNOTATORS = "tokenize ssplit lemma pos ner depparse".split() 130 | DEFAULT_PROPERTIES = {} 131 | DEFAULT_OUTPUT_FORMAT = "serialized" 132 | 133 | def __init__(self, start_server=True, 134 | endpoint="http://localhost:9000", 135 | timeout=15000, 136 | threads=5, 137 | annotators=None, 138 | properties=None, 139 | output_format=None, 140 | stdout=sys.stdout, 141 | stderr=sys.stderr, 142 | memory="4G", 143 | be_quiet=True, 144 | max_char_length=100000 145 | ): 146 | if isinstance(annotators, str): 147 | annotators = annotators.split() 148 | 149 | if start_server: 150 | host, port = urlparse(endpoint).netloc.split(":") 151 | assert host == "localhost", "If starting a server, endpoint must be localhost" 152 | 153 | assert os.getenv("CORENLP_HOME") is not None, "Please define $CORENLP_HOME where your CoreNLP Java checkout is" 154 | start_cmd = "java -Xmx{memory} -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length}".format( 155 | corenlp_home=os.getenv("CORENLP_HOME"), 156 | port=port, 157 | memory=memory, 158 | timeout=timeout, 159 | threads=threads, 160 | max_char_length=max_char_length) 161 | stop_cmd = None 162 | else: 163 | start_cmd = stop_cmd = None 164 | 165 | super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint, 166 | stdout, stderr, be_quiet) 167 | self.timeout = timeout 168 | self.default_annotators = annotators or self.DEFAULT_ANNOTATORS 169 | self.default_properties = properties or self.DEFAULT_PROPERTIES 170 | self.default_output_format = output_format or self.DEFAULT_OUTPUT_FORMAT 171 | 172 | def _request(self, buf, properties, date=None): 173 | """Send a request to the CoreNLP server. 
174 | 175 | :param (str | unicode) text: raw text for the CoreNLPServer to parse 176 | :param (dict) properties: properties that the server expects 177 | :param (str) date: reference date of document, used by server to set docDate - expects YYYY-MM-DD 178 | :return: request result 179 | """ 180 | self.ensure_alive() 181 | 182 | try: 183 | input_format = properties.get("inputFormat", "text") 184 | if input_format == "text": 185 | ctype = "text/plain; charset=utf-8" 186 | elif input_format == "serialized": 187 | ctype = "application/x-protobuf" 188 | else: 189 | raise ValueError("Unrecognized inputFormat " + input_format) 190 | 191 | if date: 192 | params = {'properties': str(properties),'date': str(date)} 193 | else: 194 | params = {'properties': str(properties)} 195 | 196 | r = requests.post(self.endpoint, 197 | params=params, 198 | data=buf, headers={'content-type': ctype}, 199 | timeout=(self.timeout*2)/1000) 200 | r.raise_for_status() 201 | return r 202 | except requests.HTTPError as e: 203 | if r.text == "CoreNLP request timed out. Your document may be too long.": 204 | raise TimeoutException(r.text) 205 | else: 206 | raise AnnotationException(r.text) 207 | 208 | def annotate(self, text, annotators=None, output_format=None, properties=None, date=None): 209 | """Send a request to the CoreNLP server. 210 | 211 | :param (str | unicode) text: raw text for the CoreNLPServer to parse 212 | :param (list | string) annotators: list of annotators to use 213 | :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml 214 | :param (dict) properties: properties that the server expects 215 | :return: request result 216 | """ 217 | # set properties for server call 218 | if properties is None: 219 | properties = self.default_properties 220 | properties.update({ 221 | 'annotators': ','.join(annotators or self.default_annotators), 222 | 'inputFormat': 'text', 223 | 'outputFormat': self.default_output_format, 224 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 225 | }) 226 | elif "annotators" not in properties: 227 | properties.update({'annotators': ','.join(annotators or self.default_annotators)}) 228 | # if an output_format is specified, use that to override 229 | if output_format is not None: 230 | properties["outputFormat"] = output_format 231 | # make the request 232 | r = self._request(text.encode('utf-8'), properties, date) 233 | # customize what is returned based outputFormat 234 | if properties["outputFormat"] == "serialized": 235 | doc = Document() 236 | parseFromDelimitedString(doc, r.content) 237 | return doc 238 | elif properties["outputFormat"] == "json": 239 | return r.json() 240 | elif properties["outputFormat"] in ["text", "conllu", "conll", "xml"]: 241 | return r.text 242 | else: 243 | return r 244 | 245 | def update(self, doc, annotators=None, properties=None, date=None): 246 | if properties is None: 247 | properties = self.default_properties 248 | properties.update({ 249 | 'annotators': ','.join(annotators or self.default_annotators), 250 | 'inputFormat': 'serialized', 251 | 'outputFormat': 'serialized', 252 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 253 | }) 254 | with io.BytesIO() as stream: 255 | writeToDelimitedString(doc, stream) 256 | msg = stream.getvalue() 257 | 258 | r = self._request(msg, properties, date) 259 | doc = Document() 260 | parseFromDelimitedString(doc, r.content) 261 | return doc 262 | 263 | def tokensregex(self, text, pattern, filter=False, to_words=False, 
annotators=None, properties=None): 264 | # this is required for some reason 265 | matches = self.__regex('/tokensregex', text, pattern, filter, annotators, properties) 266 | if to_words: 267 | matches = regex_matches_to_indexed_words(matches) 268 | return matches 269 | 270 | def semgrex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None): 271 | matches = self.__regex('/semgrex', text, pattern, filter, annotators, properties) 272 | if to_words: 273 | matches = regex_matches_to_indexed_words(matches) 274 | return matches 275 | 276 | def tregrex(self, text, pattern, filter=False, annotators=None, properties=None): 277 | return self.__regex('/tregex', text, pattern, filter, annotators, properties) 278 | 279 | def __regex(self, path, text, pattern, filter, annotators=None, properties=None): 280 | """Send a regex-related request to the CoreNLP server. 281 | :param (str | unicode) path: the path for the regex endpoint 282 | :param text: raw text for the CoreNLPServer to apply the regex 283 | :param (str | unicode) pattern: regex pattern 284 | :param (bool) filter: option to filter sentences that contain matches, if false returns matches 285 | :param properties: option to filter sentences that contain matches, if false returns matches 286 | :return: request result 287 | """ 288 | self.ensure_alive() 289 | if properties is None: 290 | properties = self.default_properties 291 | properties.update({ 292 | 'annotators': ','.join(annotators or self.default_annotators), 293 | 'inputFormat': 'text', 294 | 'outputFormat': self.default_output_format, 295 | 'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer' 296 | }) 297 | elif "annotators" not in properties: 298 | properties.update({'annotators': ','.join(annotators or self.default_annotators)}) 299 | 300 | # HACK: For some stupid reason, CoreNLPServer will timeout if we 301 | # need to annotate something from scratch. So, we need to call 302 | # this to ensure that the _regex call doesn't timeout. 303 | self.annotate(text, properties=properties) 304 | 305 | try: 306 | # Error occurs unless put properties in params 307 | input_format = properties.get("inputFormat", "text") 308 | if input_format == "text": 309 | ctype = "text/plain; charset=utf-8" 310 | elif input_format == "serialized": 311 | ctype = "application/x-protobuf" 312 | else: 313 | raise ValueError("Unrecognized inputFormat " + input_format) 314 | # change request method from `get` to `post` as required by CoreNLP 315 | r = requests.post( 316 | self.endpoint + path, params={ 317 | 'pattern': pattern, 318 | 'filter': filter, 319 | 'properties': str(properties) 320 | }, data=text, 321 | headers={'content-type': ctype}, 322 | timeout=(self.timeout*2)/1000, 323 | ) 324 | r.raise_for_status() 325 | return json.loads(r.text) 326 | except requests.HTTPError as e: 327 | if r.text.startswith("Timeout"): 328 | raise TimeoutException(r.text) 329 | else: 330 | raise AnnotationException(r.text) 331 | except json.JSONDecodeError: 332 | raise AnnotationException(r.text) 333 | 334 | def regex_matches_to_indexed_words(matches): 335 | """Transforms tokensregex and semgrex matches to indexed words. 
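Each returned word is the match dict for one match group, augmented with the index of the sentence it came from, e.g. {'text': 'wrote', 'begin': 1, 'end': 2, '$subject': {...}, 'sentence': 0} for the semgrex example in test_client.py.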
336 | :param matches: unprocessed regex matches 337 | :return: flat array of indexed words 338 | """ 339 | words = [dict(v, **dict([('sentence', i)])) 340 | for i, s in enumerate(matches['sentences']) 341 | for k, v in s.items() if k != 'length'] 342 | return words 343 | 344 | __all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"] 345 | -------------------------------------------------------------------------------- /doc/CoreNLP.proto: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp.pipeline; 2 | 3 | option java_package = "edu.stanford.nlp.pipeline"; 4 | option java_outer_classname = "CoreNLPProtos"; 5 | 6 | // 7 | // From JAVANLP_HOME, you can build me with the command: 8 | // 9 | // protoc -I=projects/core/src/edu/stanford/nlp/pipeline/ --java_out=projects/core/src projects/core/src/edu/stanford/nlp/pipeline/CoreNLP.proto 10 | // 11 | 12 | // 13 | // An enumeration for the valid languages allowed in CoreNLP 14 | // 15 | enum Language { 16 | Unknown = 0; 17 | Any = 1; 18 | Arabic = 2; 19 | Chinese = 3; 20 | English = 4; 21 | German = 5; 22 | French = 6; 23 | Hebrew = 7; 24 | Spanish = 8; 25 | UniversalEnglish = 9; 26 | } 27 | 28 | // 29 | // A document; that is, the equivalent of an Annotation. 30 | // 31 | message Document { 32 | required string text = 1; 33 | repeated Sentence sentence = 2; 34 | repeated CorefChain corefChain = 3; 35 | optional string docID = 4; 36 | optional string docDate = 7; 37 | optional uint64 calendar = 8; 38 | 39 | /** 40 | * A peculiar field, for the corner case when a Document is 41 | * serialized without any sentences. Otherwise 42 | */ 43 | repeated Token sentencelessToken = 5; 44 | 45 | repeated Quote quote = 6; 46 | /** 47 | * This field is for entity mentions across the document. 48 | */ 49 | repeated NERMention mentions = 9; 50 | 51 | extensions 100 to 255; 52 | } 53 | 54 | // 55 | // The serialized version of a CoreMap representing a sentence. 56 | // 57 | message Sentence { 58 | repeated Token token = 1; 59 | required uint32 tokenOffsetBegin = 2; 60 | required uint32 tokenOffsetEnd = 3; 61 | optional uint32 sentenceIndex = 4; 62 | optional uint32 characterOffsetBegin = 5; 63 | optional uint32 characterOffsetEnd = 6; 64 | optional ParseTree parseTree = 7; 65 | optional ParseTree binarizedParseTree = 31; 66 | optional ParseTree annotatedParseTree = 32; 67 | optional string sentiment = 33; 68 | repeated ParseTree kBestParseTrees = 34; 69 | optional DependencyGraph basicDependencies = 8; 70 | optional DependencyGraph collapsedDependencies = 9; 71 | optional DependencyGraph collapsedCCProcessedDependencies = 10; 72 | optional DependencyGraph alternativeDependencies = 13; 73 | repeated RelationTriple openieTriple = 14; // The OpenIE triples in the sentence 74 | repeated RelationTriple kbpTriple = 16; // The KBP triples in this sentence 75 | repeated SentenceFragment entailedSentence = 15; // The entailed sentences, by natural logic 76 | optional DependencyGraph enhancedDependencies = 17; 77 | optional DependencyGraph enhancedPlusPlusDependencies = 18; 78 | 79 | optional uint32 paragraph = 11; 80 | 81 | optional string text = 12; // Only needed if we're only saving the sentence. 
82 | 
83 | 
84 |   // Fields set by other annotators in CoreNLP
85 |   optional bool hasRelationAnnotations = 51;
86 |   repeated Entity entity = 52;
87 |   repeated Relation relation = 53;
88 |   optional bool hasNumerizedTokensAnnotation = 54;
89 |   repeated NERMention mentions = 55;
90 |   repeated Mention mentionsForCoref = 56;
91 |   optional bool hasCorefMentionsAnnotation = 57;
92 | 
93 |   optional string sentenceID = 58;  // Useful when storing sentences (e.g. ForEach)
94 | 
95 |   extensions 100 to 255;
96 | }
97 | 
98 | //
99 | // The serialized version of a Token (a CoreLabel).
100 | //
101 | message Token {
102 |   // Fields set by the default annotators [new CoreNLP(new Properties())]
103 |   required string word = 1;  // the word's gloss (post-tokenization)
104 |   optional string pos = 2;  // The word's part of speech tag
105 |   optional string value = 3;  // The word's 'value' (e.g., parse tree node)
106 |   optional string category = 4;  // The word's 'category' (e.g., parse tree node)
107 |   optional string before = 5;  // The whitespace/xml before the token
108 |   optional string after = 6;  // The whitespace/xml after the token
109 |   optional string originalText = 7;  // The original text for this token
110 |   optional string ner = 8;  // The word's NER tag
111 |   optional string normalizedNER = 9;  // The word's normalized NER tag
112 |   optional string lemma = 10;  // The word's lemma
113 |   optional uint32 beginChar = 11;  // The character offset begin, in the document
114 |   optional uint32 endChar = 12;  // The character offset end, in the document
115 |   optional uint32 utterance = 13;  // The utterance tag used in dcoref
116 |   optional string speaker = 14;  // The speaker speaking this word
117 |   optional uint32 beginIndex = 15;  // The begin index of, e.g., a span
118 |   optional uint32 endIndex = 16;  // The end index of, e.g., a span
119 |   optional uint32 tokenBeginIndex = 17;  // The begin index of the token
120 |   optional uint32 tokenEndIndex = 18;  // The end index of the token
121 |   optional Timex timexValue = 19;  // The time this word refers to
122 |   optional bool hasXmlContext = 21;  // Used by clean xml annotator
123 |   repeated string xmlContext = 22;  // Used by clean xml annotator
124 |   optional uint32 corefClusterID = 23;  // The [primary] cluster id for this token
125 |   optional string answer = 24;  // A temporary annotation which is occasionally left in
126 | //  optional string projectedCategory = 25;  // The syntactic category of the maximal constituent headed by the word. Not used anywhere, so deleted.
127 |   optional uint32 headWordIndex = 26;  // The index of the head word of this word.
128 |   optional Operator operator = 27;  // If this is an operator, which one is it and what is its scope (as per Natural Logic)?
129 | optional Polarity polarity = 28; // The polarity of this word, according to Natural Logic 130 | optional Span span = 29; // The span of a leaf node of a tree 131 | optional string sentiment = 30; // The final sentiment of the sentence 132 | optional int32 quotationIndex = 31; // The index of the quotation this token refers to 133 | optional MapStringString conllUFeatures = 32; 134 | optional string coarseTag = 33; // The coarse POS tag (used to store the UPOS tag) 135 | optional Span conllUTokenSpan = 34; 136 | optional string conllUMisc = 35; 137 | optional MapIntString conllUSecondaryDeps = 36; 138 | optional string wikipediaEntity = 37; 139 | 140 | 141 | // Fields set by other annotators in CoreNLP 142 | optional string gender = 51; // gender annotation (machine reading) 143 | optional string trueCase = 52; // true case type of token 144 | optional string trueCaseText = 53; // true case gloss of token 145 | 146 | // Fields in the CoreLabel java class that are moved elsewhere 147 | // string text @see Document#text + character offsets 148 | // uint32 sentenceIndex @see Sentence#sentenceIndex 149 | // string docID @see Document#docID 150 | // uint32 index @see implicit in Sentence 151 | // uint32 paragraph @see Sentence#paragraph 152 | 153 | extensions 100 to 255; 154 | } 155 | 156 | // 157 | // An enumeration of valid sentiment values for the sentiment classifier. 158 | // 159 | enum Sentiment { 160 | STRONG_NEGATIVE = 0; 161 | WEAK_NEGATIVE = 1; 162 | NEUTRAL = 2; 163 | WEAK_POSITIVE = 3; 164 | STRONG_POSITIVE = 4; 165 | } 166 | 167 | // 168 | // A quotation marker in text 169 | // 170 | message Quote { 171 | optional string text = 1; 172 | optional uint32 begin = 2; 173 | optional uint32 end = 3; 174 | optional uint32 sentenceBegin = 5; 175 | optional uint32 sentenceEnd = 6; 176 | optional uint32 tokenBegin = 7; 177 | optional uint32 tokenEnd = 8; 178 | optional string docid = 9; 179 | optional uint32 index = 10; 180 | } 181 | 182 | // 183 | // A syntactic parse tree, with scores. 184 | // 185 | message ParseTree { 186 | repeated ParseTree child = 1; 187 | optional string value = 2; 188 | optional uint32 yieldBeginIndex = 3; 189 | optional uint32 yieldEndIndex = 4; 190 | optional double score = 5; 191 | optional Sentiment sentiment = 6; 192 | } 193 | 194 | // 195 | // A dependency graph representation. 196 | // 197 | message DependencyGraph { 198 | message Node { 199 | required uint32 sentenceIndex = 1; 200 | required uint32 index = 2; 201 | optional uint32 copyAnnotation = 3; 202 | } 203 | 204 | message Edge { 205 | required uint32 source = 1; 206 | required uint32 target = 2; 207 | optional string dep = 3; 208 | optional bool isExtra = 4; 209 | optional uint32 sourceCopy = 5; 210 | optional uint32 targetCopy = 6; 211 | optional Language language = 7 [default=Unknown]; 212 | } 213 | 214 | repeated Node node = 1; 215 | repeated Edge edge = 2; 216 | repeated uint32 root = 3 [packed=true]; 217 | } 218 | 219 | // 220 | // A coreference chain. 221 | // These fields are not *really* optional. CoreNLP will crash without them. 
222 | // 223 | message CorefChain { 224 | message CorefMention { 225 | optional int32 mentionID = 1; 226 | optional string mentionType = 2; 227 | optional string number = 3; 228 | optional string gender = 4; 229 | optional string animacy = 5; 230 | optional uint32 beginIndex = 6; 231 | optional uint32 endIndex = 7; 232 | optional uint32 headIndex = 9; 233 | optional uint32 sentenceIndex = 10; 234 | optional uint32 position = 11; // the second element of position 235 | } 236 | 237 | required int32 chainID = 1; 238 | repeated CorefMention mention = 2; 239 | required uint32 representative = 3; 240 | } 241 | 242 | // 243 | // a mention 244 | // 245 | 246 | message Mention { 247 | optional int32 mentionID = 1; 248 | optional string mentionType = 2; 249 | optional string number = 3; 250 | optional string gender = 4; 251 | optional string animacy = 5; 252 | optional string person = 6; 253 | optional uint32 startIndex = 7; 254 | optional uint32 endIndex = 9; 255 | optional uint32 headIndex = 10; 256 | optional string headString = 11; 257 | optional string nerString = 12; 258 | optional uint32 originalRef = 13; 259 | optional int32 goldCorefClusterID = 14; 260 | optional int32 corefClusterID = 15; 261 | optional uint32 mentionNum = 16; 262 | optional uint32 sentNum = 17; 263 | optional uint32 utter = 18; 264 | optional uint32 paragraph = 19; 265 | optional bool isSubject = 20; 266 | optional bool isDirectObject = 21; 267 | optional bool isIndirectObject = 22; 268 | optional bool isPrepositionObject = 23; 269 | optional bool hasTwin = 24; 270 | optional bool generic = 25; 271 | optional bool isSingleton = 26; 272 | optional bool hasBasicDependency = 27; 273 | optional bool hasEnhancedDepenedncy = 28; 274 | optional bool hasContextParseTree = 29; 275 | optional IndexedWord headIndexedWord = 30; 276 | optional IndexedWord dependingVerb = 31; 277 | optional IndexedWord headWord = 32; 278 | optional SpeakerInfo speakerInfo = 33; 279 | 280 | repeated IndexedWord sentenceWords = 50; 281 | repeated IndexedWord originalSpan = 51; 282 | repeated string dependents = 52; 283 | repeated string preprocessedTerms = 53; 284 | repeated int32 appositions = 54; 285 | repeated int32 predicateNominatives = 55; 286 | repeated int32 relativePronouns = 56; 287 | repeated int32 listMembers = 57; 288 | repeated int32 belongToLists = 58; 289 | 290 | } 291 | 292 | // 293 | // store the position (sentence, token index) of a CoreLabel 294 | // 295 | 296 | message IndexedWord { 297 | optional uint32 sentenceNum = 1; 298 | optional uint32 tokenIndex = 2; 299 | optional uint32 docID = 3; 300 | optional uint32 copyCount = 4; 301 | } 302 | 303 | // 304 | // speaker info, this is used for Mentions 305 | // 306 | 307 | message SpeakerInfo { 308 | optional string speakerName = 1; 309 | repeated int32 mentions = 2; 310 | } 311 | 312 | // 313 | // A Span of text 314 | // 315 | message Span { 316 | required uint32 begin = 1; 317 | required uint32 end = 2; 318 | } 319 | 320 | // 321 | // A Timex object, representing a temporal expression (TIMe EXpression) 322 | // These fields are not *really* optional. CoreNLP will crash without them. 323 | // 324 | message Timex { 325 | optional string value = 1; 326 | optional string altValue = 2; 327 | optional string text = 3; 328 | optional string type = 4; 329 | optional string tid = 5; 330 | optional uint32 beginPoint = 6; 331 | optional uint32 endPoint = 7; 332 | } 333 | 334 | // 335 | // A representation of an entity in a relation. 
336 | // This corresponds to the EntityMention, and more broadly the 337 | // ExtractionObject classes. 338 | // 339 | message Entity { 340 | optional uint32 headStart = 6; 341 | optional uint32 headEnd = 7; 342 | optional string mentionType = 8; 343 | optional string normalizedName = 9; 344 | optional uint32 headTokenIndex = 10; 345 | optional string corefID = 11; 346 | // inherited from ExtractionObject 347 | optional string objectID = 1; 348 | optional uint32 extentStart = 2; 349 | optional uint32 extentEnd = 3; 350 | optional string type = 4; 351 | optional string subtype = 5; 352 | // Implicit 353 | // uint32 sentence @see implicit in sentence 354 | } 355 | 356 | // 357 | // A representation of a relation, mirroring RelationMention 358 | // 359 | message Relation { 360 | repeated string argName = 6; 361 | repeated Entity arg = 7; 362 | optional string signature = 8; 363 | // inherited from ExtractionObject 364 | optional string objectID = 1; 365 | optional uint32 extentStart = 2; 366 | optional uint32 extentEnd = 3; 367 | optional string type = 4; 368 | optional string subtype = 5; 369 | // Implicit 370 | // uint32 sentence @see implicit in sentence 371 | } 372 | 373 | // 374 | // A Natural Logic operator 375 | // 376 | message Operator { 377 | required string name = 1; 378 | required int32 quantifierSpanBegin = 2; 379 | required int32 quantifierSpanEnd = 3; 380 | required int32 subjectSpanBegin = 4; 381 | required int32 subjectSpanEnd = 5; 382 | required int32 objectSpanBegin = 6; 383 | required int32 objectSpanEnd = 7; 384 | } 385 | 386 | // 387 | // The seven informative Natural Logic relations 388 | // 389 | enum NaturalLogicRelation { 390 | EQUIVALENCE = 0; 391 | FORWARD_ENTAILMENT = 1; 392 | REVERSE_ENTAILMENT = 2; 393 | NEGATION = 3; 394 | ALTERNATION = 4; 395 | COVER = 5; 396 | INDEPENDENCE = 6; 397 | } 398 | 399 | // 400 | // The polarity of a word, according to Natural Logic 401 | // 402 | message Polarity { 403 | required NaturalLogicRelation projectEquivalence = 1; 404 | required NaturalLogicRelation projectForwardEntailment = 2; 405 | required NaturalLogicRelation projectReverseEntailment = 3; 406 | required NaturalLogicRelation projectNegation = 4; 407 | required NaturalLogicRelation projectAlternation = 5; 408 | required NaturalLogicRelation projectCover = 6; 409 | required NaturalLogicRelation projectIndependence = 7; 410 | } 411 | 412 | // 413 | // An NER mention in the text 414 | // 415 | message NERMention { 416 | optional uint32 sentenceIndex = 1; 417 | required uint32 tokenStartInSentenceInclusive = 2; 418 | required uint32 tokenEndInSentenceExclusive = 3; 419 | required string ner = 4; 420 | optional string normalizedNER = 5; 421 | optional string entityType = 6; 422 | optional Timex timex = 7; 423 | optional string wikipediaEntity = 8; 424 | } 425 | 426 | // 427 | // An entailed sentence fragment. 428 | // Created by the openie annotator. 429 | // 430 | message SentenceFragment { 431 | repeated uint32 tokenIndex = 1; 432 | optional uint32 root = 2; 433 | optional bool assumedTruth = 3; 434 | optional double score = 4; 435 | } 436 | 437 | 438 | // 439 | // The index of a token in a document, including the sentence 440 | // index and the offset. 441 | // 442 | message TokenLocation { 443 | optional uint32 sentenceIndex = 1; 444 | optional uint32 tokenIndex = 2; 445 | 446 | } 447 | 448 | 449 | // 450 | // An OpenIE relation triple. 451 | // Created by the openie annotator. 
452 | //
453 | message RelationTriple {
454 |   optional string subject = 1;  // The surface form of the subject
455 |   optional string relation = 2;  // The surface form of the relation (required)
456 |   optional string object = 3;  // The surface form of the object
457 |   optional double confidence = 4;  // The [optional] confidence of the extraction
458 |   repeated TokenLocation subjectTokens = 13;  // The tokens comprising the subject of the triple
459 |   repeated TokenLocation relationTokens = 14;  // The tokens comprising the relation of the triple
460 |   repeated TokenLocation objectTokens = 15;  // The tokens comprising the object of the triple
461 |   optional DependencyGraph tree = 8;  // The dependency graph fragment for this triple
462 |   optional bool istmod = 9;  // If true, this expresses an implicit tmod relation
463 |   optional bool prefixBe = 10;  // If true, this relation string is missing a 'be' prefix
464 |   optional bool suffixBe = 11;  // If true, this relation string is missing a 'be' suffix
465 |   optional bool suffixOf = 12;  // If true, this relation string is missing an 'of' suffix
466 | }
467 | 
468 | 
469 | //
470 | // A map from strings to strings.
471 | // Used, minimally, in the CoNLLU featurizer
472 | //
473 | message MapStringString {
474 |   repeated string key = 1;
475 |   repeated string value = 2;
476 | }
477 | 
478 | //
479 | // A map from integers to strings.
480 | // Used, minimally, in the CoNLLU featurizer
481 | //
482 | message MapIntString {
483 |   repeated uint32 key = 1;
484 |   repeated string value = 2;
485 | }
486 | 
487 | 
--------------------------------------------------------------------------------
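A minimal usage sketch tying the client code above to the schema in doc/CoreNLP.proto. It assumes the package is importable as corenlp, that a CoreNLP server is reachable with the client's default constructor settings (the constructor appears earlier in corenlp/client.py and is not shown here), and that the sample text and the tokensregex pattern are purely illustrative.

# Sketch only: anything marked "illustrative" or "assumed" is not taken from the repository.
import corenlp

# Assumes the default endpoint/annotators configured earlier in client.py.
client = corenlp.CoreNLPClient()

text = "Chris wrote a simple sentence."  # illustrative input

# output_format="serialized" makes annotate() parse the response into a
# corenlp.Document protobuf message via parseFromDelimitedString().
doc = client.annotate(text,
                      annotators=["tokenize", "ssplit", "pos", "lemma", "ner"],
                      output_format="serialized")
for sentence in doc.sentence:      # repeated Sentence (see doc/CoreNLP.proto)
    for token in sentence.token:   # repeated Token
        print(token.word, token.pos, token.ner)

# update() round-trips a serialized Document through the server for further
# annotation; annotate() with output_format="json" returns the decoded JSON,
# while "text", "conll", "conllu", and "xml" return the raw response text.
doc = client.update(doc)
ann = client.annotate(text, annotators=["tokenize", "ssplit"], output_format="json")

# tokensregex()/semgrex() return the server's JSON match structure; with
# to_words=True the matches are flattened by regex_matches_to_indexed_words().
matches = client.tokensregex(text, "/wrote/", to_words=True)  # illustrative pattern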