├── .gitignore
├── LICENSE
├── README.md
├── setup.py
├── stanfordcorenlp
│   ├── __init__.py
│   └── corenlp.py
├── test.py
├── unit_test.py
└── wiki
    └── coref.png

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Lynten

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## stanfordcorenlp
[![PyPI](https://img.shields.io/pypi/v/stanfordcorenlp.svg)]()
[![GitHub release](https://img.shields.io/github/release/Lynten/stanford-corenlp.svg)]()
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stanfordcorenlp.svg)]()


`stanfordcorenlp` is a Python wrapper for [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/). It provides a simple API for text processing tasks such as Tokenization, Part of Speech Tagging, Named Entity Recognition, Constituency Parsing, Dependency Parsing, and more.

## Prerequisites
Java 1.8+ (check with: `java -version`) ([Download Page](http://www.oracle.com/technetwork/cn/java/javase/downloads/jdk8-downloads-2133151-zhs.html))

Stanford CoreNLP ([Download Page](https://stanfordnlp.github.io/CoreNLP/history.html))

| Py Version | CoreNLP Version |
| --- | --- |
| v3.7.0.1, v3.7.0.2 | CoreNLP 3.7.0 |
| v3.8.0.1 | CoreNLP 3.8.0 |
| v3.9.1.1 | CoreNLP 3.9.1 |

## Installation

`pip install stanfordcorenlp`

## Example
### Simple Usage
```python
# Simple usage
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'G:\JavaLibraries\stanford-corenlp-full-2018-02-27')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
```

Output format:
```python
# Tokenize
[u'Guangdong', u'University', u'of', u'Foreign', u'Studies', u'is', u'located', u'in', u'Guangzhou', u'.']

# Part of Speech
[(u'Guangdong', u'NNP'), (u'University', u'NNP'), (u'of', u'IN'), (u'Foreign', u'NNP'), (u'Studies', u'NNPS'), (u'is', u'VBZ'), (u'located', u'JJ'), (u'in', u'IN'), (u'Guangzhou', u'NNP'), (u'.', u'.')]

# Named Entities
[(u'Guangdong', u'ORGANIZATION'), (u'University', u'ORGANIZATION'), (u'of', u'ORGANIZATION'), (u'Foreign', u'ORGANIZATION'), (u'Studies', u'ORGANIZATION'), (u'is', u'O'), (u'located', u'O'), (u'in', u'O'), (u'Guangzhou', u'LOCATION'), (u'.', u'O')]

# Constituency Parsing
(ROOT
  (S
    (NP
      (NP (NNP Guangdong) (NNP University))
      (PP (IN of)
        (NP (NNP Foreign) (NNPS Studies))))
    (VP (VBZ is)
      (ADJP (JJ located)
        (PP (IN in)
          (NP (NNP Guangzhou)))))
    (. .)))

# Dependency Parsing
[(u'ROOT', 0, 7), (u'compound', 2, 1), (u'nsubjpass', 7, 2), (u'case', 5, 3), (u'compound', 5, 4), (u'nmod', 2, 5), (u'auxpass', 7, 6), (u'case', 9, 8), (u'nmod', 7, 9), (u'punct', 7, 10)]
```
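Each dependency triple has the form `(relation, governor_index, dependent_index)`, where the indices are 1-based token positions and `0` stands for the artificial `ROOT` node. As a quick sketch (not part of the library API, and assuming a single-sentence input, since indices restart at each sentence), you can pair the triples with the token list to get word-level dependencies before calling `nlp.close()`:

```python
# Resolve index-based triples into (relation, governor_word, dependent_word).
tokens = nlp.word_tokenize(sentence)
readable = [(rel, 'ROOT' if head == 0 else tokens[head - 1], tokens[dep - 1])
            for rel, head, dep in nlp.dependency_parse(sentence)]
print(readable)
# [('ROOT', 'ROOT', 'located'), ('compound', 'University', 'Guangdong'), ...]
```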
### Other Human Languages Support
Note: you must download an additional model file and place it in the `.../stanford-corenlp-full-2018-02-27` folder. For example, download the `stanford-chinese-corenlp-2018-02-27-models.jar` file if you want to process Chinese.
```python
# _*_coding:utf-8_*_

# Other human languages support, e.g. Chinese
sentence = '清华大学位于北京。'

with StanfordCoreNLP(r'G:\JavaLibraries\stanford-corenlp-full-2018-02-27', lang='zh') as nlp:
    print(nlp.word_tokenize(sentence))
    print(nlp.pos_tag(sentence))
    print(nlp.ner(sentence))
    print(nlp.parse(sentence))
    print(nlp.dependency_parse(sentence))
```

### General Stanford CoreNLP API
This loads all the models, which requires more memory, so initialize the server with more memory; 8 GB is recommended.

```python
# General json output
nlp = StanfordCoreNLP(r'path_to_corenlp', memory='8g')
print(nlp.annotate(sentence))
nlp.close()
```
You can specify properties:

- `annotators`: `tokenize, ssplit, pos, lemma, ner, parse, depparse, dcoref` ([See Detail](https://stanfordnlp.github.io/CoreNLP/annotators.html))

- `pipelineLanguage`: `en, zh, ar, fr, de, es` (English, Chinese, Arabic, French, German, Spanish) ([See Annotator Support Detail](https://stanfordnlp.github.io/CoreNLP/human-languages.html))

- `outputFormat`: `json, xml, text`
```python
text = 'Guangdong University of Foreign Studies is located in Guangzhou. ' \
       'GDUFS is active in a full range of international cooperation and exchanges in education. '

props = {'annotators': 'tokenize,ssplit,pos', 'pipelineLanguage': 'en', 'outputFormat': 'xml'}
print(nlp.annotate(text, properties=props))
nlp.close()
```
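When `outputFormat` is `json`, `annotate` returns a JSON string that you can parse and walk yourself. A minimal sketch (the `sentences`/`tokens` layout is the CoreNLP server's standard JSON output):

```python
import json

nlp = StanfordCoreNLP(r'path_to_corenlp')
props = {'annotators': 'tokenize,ssplit,pos,lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}

# Parse the JSON response and print one token per line.
doc = json.loads(nlp.annotate(text, properties=props))
for s in doc['sentences']:
    for token in s['tokens']:
        print(token['originalText'], token['pos'], token['lemma'])
nlp.close()
```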
### Use an Existing Server
Start a [CoreNLP Server](https://stanfordnlp.github.io/CoreNLP/corenlp-server.html) with the command:
```
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
```
And then:
```python
# Use an existing server
nlp = StanfordCoreNLP('http://localhost', port=9000)
```
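The returned instance works the same as one backed by a local CoreNLP directory, since every helper method simply posts to the server's URL. For example:

```python
print(nlp.word_tokenize('Stanford CoreNLP is written in Java.'))
nlp.close()  # A no-op here: the wrapper only kills server processes it started itself.
```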
## Debug
```python
import logging
from stanfordcorenlp import StanfordCoreNLP

# Debug the wrapper
nlp = StanfordCoreNLP(r'path_or_host', logging_level=logging.DEBUG)

# Check more info from the CoreNLP Server
nlp = StanfordCoreNLP(r'path_or_host', quiet=False, logging_level=logging.DEBUG)
nlp.close()
```

## Build

We use `setuptools` to package our project. You can build from the latest source code with the following command:
```
$ python setup.py bdist_wheel --universal
```

You will find the `.whl` file in the `dist` directory.
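You can then install the freshly built wheel with pip; the exact file name depends on the version you built, e.g.:
```
$ pip install dist/stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl
```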
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='stanfordcorenlp',
    packages=['stanfordcorenlp'],
    version='3.9.1.1',
    description='Python wrapper for Stanford CoreNLP.',

    author='Lynten Guo',
    author_email='1216920263@qq.com',

    url='https://github.com/Lynten/stanford-corenlp',
    keywords=['NLP', 'CL', 'natural language processing',
              'computational linguistics'],
    install_requires=['psutil', 'requests'],

    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: Linguistic',
    ],

    license="MIT License",

)
--------------------------------------------------------------------------------
/stanfordcorenlp/__init__.py:
--------------------------------------------------------------------------------
from stanfordcorenlp.corenlp import StanfordCoreNLP
--------------------------------------------------------------------------------
/stanfordcorenlp/corenlp.py:
--------------------------------------------------------------------------------
# _*_coding:utf-8_*_
from __future__ import print_function

import glob
import json
import logging
import os
import re
import socket
import subprocess
import sys
import time

import psutil

try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

import requests


class StanfordCoreNLP:
    def __init__(self, path_or_host, port=None, memory='4g', lang='en', timeout=1500, quiet=True,
                 logging_level=logging.WARNING, max_retries=5):
        self.path_or_host = path_or_host
        self.port = port
        self.memory = memory
        self.lang = lang
        self.timeout = timeout
        self.quiet = quiet
        self.logging_level = logging_level

        logging.basicConfig(level=self.logging_level)

        # Check args
        self._check_args()

        if path_or_host.startswith('http'):
            self.url = path_or_host + ':' + str(port)
            logging.info('Using an existing server {}'.format(self.url))
        else:

            # Check Java
            if not subprocess.call(['java', '-version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) == 0:
                raise RuntimeError('Java not found.')

            # Check if the dir exists
            if not os.path.isdir(self.path_or_host):
                raise IOError(str(self.path_or_host) + ' is not a directory.')
            directory = os.path.normpath(self.path_or_host) + os.sep
            self.class_path_dir = directory

            # Check if the language-specific model file exists
            switcher = {
                'en': 'stanford-corenlp-[0-9].[0-9].[0-9]-models.jar',
                'zh': 'stanford-chinese-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'ar': 'stanford-arabic-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'fr': 'stanford-french-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'de': 'stanford-german-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'es': 'stanford-spanish-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar'
            }
            jars = {
                'en': 'stanford-corenlp-x.x.x-models.jar',
                'zh': 'stanford-chinese-corenlp-yyyy-MM-dd-models.jar',
                'ar': 'stanford-arabic-corenlp-yyyy-MM-dd-models.jar',
                'fr': 'stanford-french-corenlp-yyyy-MM-dd-models.jar',
                'de': 'stanford-german-corenlp-yyyy-MM-dd-models.jar',
                'es': 'stanford-spanish-corenlp-yyyy-MM-dd-models.jar'
            }
            if len(glob.glob(directory + switcher.get(self.lang))) <= 0:
                raise IOError(jars.get(self.lang) + ' does not exist. You should download it and place it in '
                              + directory + ' first.')

            # If port not set, auto-select a free one starting from 9000
            if self.port is None:
                for port_candidate in range(9000, 65535):
                    if port_candidate not in [conn.laddr[1] for conn in psutil.net_connections()]:
                        self.port = port_candidate
                        break

            # Check if the port is in use
            if self.port in [conn.laddr[1] for conn in psutil.net_connections()]:
                raise IOError('Port ' + str(self.port) + ' is already in use.')

            # Start native server
            logging.info('Initializing native server...')
            cmd = "java"
            java_args = "-Xmx{}".format(self.memory)
            java_class = "edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
            class_path = '"{}*"'.format(directory)

            args = [cmd, java_args, '-cp', class_path, java_class, '-port', str(self.port)]

            args = ' '.join(args)

            logging.info(args)

            # Silence the server's output when quiet mode is on
            with open(os.devnull, 'w') as null_file:
                out_file = None
                if self.quiet:
                    out_file = null_file

                self.p = subprocess.Popen(args, shell=True, stdout=out_file, stderr=subprocess.STDOUT)
                logging.info('Server shell PID: {}'.format(self.p.pid))

            self.url = 'http://localhost:' + str(self.port)

        # Wait until the server starts
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        host_name = urlparse(self.url).hostname
        time.sleep(1)  # OSX, not tested
        trial = 1
        while sock.connect_ex((host_name, self.port)):
            if trial > max_retries:
                raise ValueError('Corenlp server is not available')
            logging.info('Waiting until the server is available.')
            trial += 1
            time.sleep(1)
        logging.info('The server is available.')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        logging.info('Cleanup...')
        if hasattr(self, 'p'):
            try:
                parent = psutil.Process(self.p.pid)
            except psutil.NoSuchProcess:
                logging.info('No process: {}'.format(self.p.pid))
                return

            if self.class_path_dir not in ' '.join(parent.cmdline()):
                logging.info('Process not in: {}'.format(parent.cmdline()))
                return

            children = parent.children(recursive=True)
            for process in children:
                logging.info('Killing pid: {}, cmdline: {}'.format(process.pid, process.cmdline()))
                # process.send_signal(signal.SIGTERM)
                process.kill()

            logging.info('Killing shell pid: {}, cmdline: {}'.format(parent.pid, parent.cmdline()))
            # parent.send_signal(signal.SIGTERM)
            parent.kill()
    def annotate(self, text, properties=None):
        if sys.version_info.major >= 3:
            text = text.encode('utf-8')

        r = requests.post(self.url, params={'properties': str(properties)}, data=text,
                          headers={'Connection': 'close'})
        return r.text

    def tregex(self, sentence, pattern):
        tregex_url = self.url + '/tregex'
        r_dict = self._request(tregex_url, "tokenize,ssplit,depparse,parse", sentence, pattern=pattern)
        return r_dict

    def tokensregex(self, sentence, pattern):
        tokensregex_url = self.url + '/tokensregex'
        r_dict = self._request(tokensregex_url, "tokenize,ssplit,depparse", sentence, pattern=pattern)
        return r_dict

    def semgrex(self, sentence, pattern):
        semgrex_url = self.url + '/semgrex'
        r_dict = self._request(semgrex_url, "tokenize,ssplit,depparse", sentence, pattern=pattern)
        return r_dict

    def word_tokenize(self, sentence, span=False):
        r_dict = self._request(self.url, 'ssplit,tokenize', sentence)
        tokens = [token['originalText'] for s in r_dict['sentences'] for token in s['tokens']]

        # Whether to also return (begin, end) character offsets for each token
        if span:
            spans = [(token['characterOffsetBegin'], token['characterOffsetEnd'])
                     for s in r_dict['sentences'] for token in s['tokens']]
            return tokens, spans
        else:
            return tokens

    def pos_tag(self, sentence):
        r_dict = self._request(self.url, 'pos', sentence)
        words = []
        tags = []
        for s in r_dict['sentences']:
            for token in s['tokens']:
                words.append(token['originalText'])
                tags.append(token['pos'])
        return list(zip(words, tags))

    def ner(self, sentence):
        r_dict = self._request(self.url, 'ner', sentence)
        words = []
        ner_tags = []
        for s in r_dict['sentences']:
            for token in s['tokens']:
                words.append(token['originalText'])
                ner_tags.append(token['ner'])
        return list(zip(words, ner_tags))

    def parse(self, sentence):
        # Returns the constituency parse of the first sentence only.
        r_dict = self._request(self.url, 'pos,parse', sentence)
        return [s['parse'] for s in r_dict['sentences']][0]

    def dependency_parse(self, sentence):
        # Returns (relation, governor_index, dependent_index) triples;
        # indices are 1-based token positions, 0 is the artificial ROOT.
        r_dict = self._request(self.url, 'depparse', sentence)
        return [(dep['dep'], dep['governor'], dep['dependent']) for s in r_dict['sentences'] for dep in
                s['basicDependencies']]

    def coref(self, text):
        r_dict = self._request(self.url, 'coref', text)

        corefs = []
        for k, mentions in r_dict['corefs'].items():
            simplified_mentions = []
            for m in mentions:
                simplified_mentions.append((m['sentNum'], m['startIndex'], m['endIndex'], m['text']))
            corefs.append(simplified_mentions)
        return corefs

    def switch_language(self, language="en"):
        self._check_language(language)
        self.lang = language

    def _request(self, url, annotators=None, data=None, *args, **kwargs):
        if sys.version_info.major >= 3:
            data = data.encode('utf-8')

        properties = {'annotators': annotators, 'outputFormat': 'json'}
        params = {'properties': str(properties), 'pipelineLanguage': self.lang}
        if 'pattern' in kwargs:
            params = {"pattern": kwargs['pattern'], 'properties': str(properties), 'pipelineLanguage': self.lang}

        logging.info(params)
        r = requests.post(url, params=params, data=data, headers={'Connection': 'close'})
        r_dict = json.loads(r.text)

        return r_dict

    def _check_args(self):
        self._check_language(self.lang)
        if not re.match(r'\d+g', self.memory):
            raise ValueError('memory=' + self.memory + ' not supported. Use 4g, 6g, 8g, etc.')

    def _check_language(self, lang):
        if lang not in ['en', 'zh', 'ar', 'fr', 'de', 'es']:
            raise ValueError('lang=' + lang + ' not supported. Use English(en), Chinese(zh), Arabic(ar), '
                             'French(fr), German(de), Spanish(es).')
' 44 | pros = {'annotators': 'tokenize,ssplit,pos', 'pinelineLanguage': 'en', 'outputFormat': 'xml'} 45 | print(nlp.annotate(text, properties=pros)) 46 | nlp.close() 47 | 48 | # Use an existing server 49 | nlp = StanfordCoreNLP('http://corenlp.run', port=80) 50 | -------------------------------------------------------------------------------- /unit_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from stanfordcorenlp import StanfordCoreNLP 4 | 5 | 6 | class MyTestCase(unittest.TestCase): 7 | def test_args(self): 8 | self.assertRaises(IOError, StanfordCoreNLP, '/abc') 9 | self.assertRaises(ValueError, StanfordCoreNLP, r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', 10 | lang='abc') 11 | self.assertRaises(ValueError, StanfordCoreNLP, r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', 12 | memory='4m') 13 | 14 | 15 | if __name__ == '__main__': 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /wiki/coref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lynten/stanford-corenlp/dec81f51b72469877512c78abc45fd2581bd1237/wiki/coref.png --------------------------------------------------------------------------------