├── .gitignore
├── LICENSE
├── README.md
├── setup.py
├── stanfordcorenlp
│   ├── __init__.py
│   └── corenlp.py
├── test.py
├── unit_test.py
└── wiki
    └── coref.png

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Lynten

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## stanfordcorenlp
[![PyPI](https://img.shields.io/pypi/v/stanfordcorenlp.svg)]()
[![GitHub release](https://img.shields.io/github/release/Lynten/stanford-corenlp.svg)]()
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stanfordcorenlp.svg)]()


`stanfordcorenlp` is a Python wrapper for [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/). It provides a simple API for text processing tasks such as Tokenization, Part of Speech Tagging, Named Entity Recognition, Constituency Parsing, Dependency Parsing, and more.

## Prerequisites
Java 1.8+ (check with: `java -version`) ([Download Page](http://www.oracle.com/technetwork/cn/java/javase/downloads/jdk8-downloads-2133151-zhs.html))

Stanford CoreNLP ([Download Page](https://stanfordnlp.github.io/CoreNLP/history.html))

| Py Version | CoreNLP Version |
| --- | --- |
| v3.7.0.1, v3.7.0.2 | CoreNLP 3.7.0 |
| v3.8.0.1 | CoreNLP 3.8.0 |
| v3.9.1.1 | CoreNLP 3.9.1 |

## Installation

`pip install stanfordcorenlp`

## Example
### Simple Usage
```python
# Simple usage
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'G:\JavaLibraries\stanford-corenlp-full-2018-02-27')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
```

Output format:
```python
# Tokenize
[u'Guangdong', u'University', u'of', u'Foreign', u'Studies', u'is', u'located', u'in', u'Guangzhou', u'.']

# Part of Speech
[(u'Guangdong', u'NNP'), (u'University', u'NNP'), (u'of', u'IN'), (u'Foreign', u'NNP'), (u'Studies', u'NNPS'), (u'is', u'VBZ'), (u'located', u'JJ'), (u'in', u'IN'), (u'Guangzhou', u'NNP'), (u'.', u'.')]

# Named Entities
[(u'Guangdong', u'ORGANIZATION'), (u'University', u'ORGANIZATION'), (u'of', u'ORGANIZATION'), (u'Foreign', u'ORGANIZATION'), (u'Studies', u'ORGANIZATION'), (u'is', u'O'), (u'located', u'O'), (u'in', u'O'), (u'Guangzhou', u'LOCATION'), (u'.', u'O')]

# Constituency Parsing
(ROOT
  (S
    (NP
      (NP (NNP Guangdong) (NNP University))
      (PP (IN of)
        (NP (NNP Foreign) (NNPS Studies))))
    (VP (VBZ is)
      (ADJP (JJ located)
        (PP (IN in)
          (NP (NNP Guangzhou)))))
    (. .)))

# Dependency Parsing
[(u'ROOT', 0, 7), (u'compound', 2, 1), (u'nsubjpass', 7, 2), (u'case', 5, 3), (u'compound', 5, 4), (u'nmod', 2, 5), (u'auxpass', 7, 6), (u'case', 9, 8), (u'nmod', 7, 9), (u'punct', 7, 10)]
```
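Each dependency triple has the form `(relation, governor_index, dependent_index)`, where the indices are 1-based token positions and `0` stands for the artificial `ROOT` node. As a quick sketch (not part of the library API, and assuming a single-sentence input, since indices restart at each sentence), you can pair the triples with the token list to get word-level dependencies before calling `nlp.close()`:

```python
# Resolve index-based triples into (relation, governor_word, dependent_word).
tokens = nlp.word_tokenize(sentence)
readable = [(rel, 'ROOT' if head == 0 else tokens[head - 1], tokens[dep - 1])
            for rel, head, dep in nlp.dependency_parse(sentence)]
print(readable)
# [('ROOT', 'ROOT', 'located'), ('compound', 'University', 'Guangdong'), ...]
```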
### Other Human Languages Support
Note: you must download an additional model file and place it in the `.../stanford-corenlp-full-2018-02-27` folder. For example, download the `stanford-chinese-corenlp-2018-02-27-models.jar` file if you want to process Chinese.
```python
# _*_coding:utf-8_*_

# Other human languages support, e.g. Chinese
sentence = '清华大学位于北京。'

with StanfordCoreNLP(r'G:\JavaLibraries\stanford-corenlp-full-2018-02-27', lang='zh') as nlp:
    print(nlp.word_tokenize(sentence))
    print(nlp.pos_tag(sentence))
    print(nlp.ner(sentence))
    print(nlp.parse(sentence))
    print(nlp.dependency_parse(sentence))
```

### General Stanford CoreNLP API
This loads all the models, which requires more memory, so initialize the server with more memory; 8 GB is recommended.

```python
# General json output
nlp = StanfordCoreNLP(r'path_to_corenlp', memory='8g')
print(nlp.annotate(sentence))
nlp.close()
```
You can specify properties:

- `annotators`: `tokenize, ssplit, pos, lemma, ner, parse, depparse, dcoref` ([See Detail](https://stanfordnlp.github.io/CoreNLP/annotators.html))

- `pipelineLanguage`: `en, zh, ar, fr, de, es` (English, Chinese, Arabic, French, German, Spanish) ([See Annotator Support Detail](https://stanfordnlp.github.io/CoreNLP/human-languages.html))

- `outputFormat`: `json, xml, text`
```python
text = 'Guangdong University of Foreign Studies is located in Guangzhou. ' \
       'GDUFS is active in a full range of international cooperation and exchanges in education. '

props = {'annotators': 'tokenize,ssplit,pos', 'pipelineLanguage': 'en', 'outputFormat': 'xml'}
print(nlp.annotate(text, properties=props))
nlp.close()
```
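When `outputFormat` is `json`, `annotate` returns a JSON string that you can parse and walk yourself. A minimal sketch (the `sentences`/`tokens` layout is the CoreNLP server's standard JSON output):

```python
import json

nlp = StanfordCoreNLP(r'path_to_corenlp')
props = {'annotators': 'tokenize,ssplit,pos,lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}

# Parse the JSON response and print one token per line.
doc = json.loads(nlp.annotate(text, properties=props))
for s in doc['sentences']:
    for token in s['tokens']:
        print(token['originalText'], token['pos'], token['lemma'])
nlp.close()
```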
### Use an Existing Server
Start a [CoreNLP Server](https://stanfordnlp.github.io/CoreNLP/corenlp-server.html) with the command:
```
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
```
And then:
```python
# Use an existing server
nlp = StanfordCoreNLP('http://localhost', port=9000)
```
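The returned instance works the same as one backed by a local CoreNLP directory, since every helper method simply posts to the server's URL. For example:

```python
print(nlp.word_tokenize('Stanford CoreNLP is written in Java.'))
nlp.close()  # A no-op here: the wrapper only kills server processes it started itself.
```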
## Debug
```python
import logging
from stanfordcorenlp import StanfordCoreNLP

# Debug the wrapper
nlp = StanfordCoreNLP(r'path_or_host', logging_level=logging.DEBUG)

# Check more info from the CoreNLP Server
nlp = StanfordCoreNLP(r'path_or_host', quiet=False, logging_level=logging.DEBUG)
nlp.close()
```

## Build

We use `setuptools` to package our project. You can build from the latest source code with the following command:
```
$ python setup.py bdist_wheel --universal
```

You will find the `.whl` file in the `dist` directory.
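You can then install the freshly built wheel with pip; the exact file name depends on the version you built, e.g.:
```
$ pip install dist/stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl
```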
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='stanfordcorenlp',
    packages=['stanfordcorenlp'],
    version='3.9.1.1',
    description='Python wrapper for Stanford CoreNLP.',

    author='Lynten Guo',
    author_email='1216920263@qq.com',

    url='https://github.com/Lynten/stanford-corenlp',
    keywords=['NLP', 'CL', 'natural language processing',
              'computational linguistics'],
    install_requires=['psutil', 'requests'],

    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: Linguistic',
    ],

    license="MIT License",

)
--------------------------------------------------------------------------------
/stanfordcorenlp/__init__.py:
--------------------------------------------------------------------------------
from stanfordcorenlp.corenlp import StanfordCoreNLP
--------------------------------------------------------------------------------
/stanfordcorenlp/corenlp.py:
--------------------------------------------------------------------------------
# _*_coding:utf-8_*_
from __future__ import print_function

import glob
import json
import logging
import os
import re
import socket
import subprocess
import sys
import time

import psutil

try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

import requests


class StanfordCoreNLP:
    def __init__(self, path_or_host, port=None, memory='4g', lang='en', timeout=1500, quiet=True,
                 logging_level=logging.WARNING, max_retries=5):
        self.path_or_host = path_or_host
        self.port = port
        self.memory = memory
        self.lang = lang
        self.timeout = timeout
        self.quiet = quiet
        self.logging_level = logging_level

        logging.basicConfig(level=self.logging_level)

        # Check args
        self._check_args()

        if path_or_host.startswith('http'):
            self.url = path_or_host + ':' + str(port)
            logging.info('Using an existing server {}'.format(self.url))
        else:

            # Check Java
            if not subprocess.call(['java', '-version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) == 0:
                raise RuntimeError('Java not found.')

            # Check if the dir exists
            if not os.path.isdir(self.path_or_host):
                raise IOError(str(self.path_or_host) + ' is not a directory.')
            directory = os.path.normpath(self.path_or_host) + os.sep
            self.class_path_dir = directory

            # Check if the language-specific model file exists
            switcher = {
                'en': 'stanford-corenlp-[0-9].[0-9].[0-9]-models.jar',
                'zh': 'stanford-chinese-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'ar': 'stanford-arabic-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'fr': 'stanford-french-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'de': 'stanford-german-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar',
                'es': 'stanford-spanish-corenlp-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-models.jar'
            }
            jars = {
                'en': 'stanford-corenlp-x.x.x-models.jar',
                'zh': 'stanford-chinese-corenlp-yyyy-MM-dd-models.jar',
                'ar': 'stanford-arabic-corenlp-yyyy-MM-dd-models.jar',
                'fr': 'stanford-french-corenlp-yyyy-MM-dd-models.jar',
                'de': 'stanford-german-corenlp-yyyy-MM-dd-models.jar',
                'es': 'stanford-spanish-corenlp-yyyy-MM-dd-models.jar'
            }
            if len(glob.glob(directory + switcher.get(self.lang))) <= 0:
                raise IOError(jars.get(self.lang) + ' does not exist. You should download it and place it in '
                              + directory + ' first.')

            # If port not set, auto-select a free one starting from 9000
            if self.port is None:
                for port_candidate in range(9000, 65535):
                    if port_candidate not in [conn.laddr[1] for conn in psutil.net_connections()]:
                        self.port = port_candidate
                        break

            # Check if the port is in use
            if self.port in [conn.laddr[1] for conn in psutil.net_connections()]:
                raise IOError('Port ' + str(self.port) + ' is already in use.')

            # Start native server
            logging.info('Initializing native server...')
            cmd = "java"
            java_args = "-Xmx{}".format(self.memory)
            java_class = "edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
            class_path = '"{}*"'.format(directory)

            args = [cmd, java_args, '-cp', class_path, java_class, '-port', str(self.port)]

            args = ' '.join(args)

            logging.info(args)

            # Silence the server's output when quiet mode is on
            with open(os.devnull, 'w') as null_file:
                out_file = None
                if self.quiet:
                    out_file = null_file

                self.p = subprocess.Popen(args, shell=True, stdout=out_file, stderr=subprocess.STDOUT)
                logging.info('Server shell PID: {}'.format(self.p.pid))

            self.url = 'http://localhost:' + str(self.port)

        # Wait until the server starts
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        host_name = urlparse(self.url).hostname
        time.sleep(1)  # OSX, not tested
        trial = 1
        while sock.connect_ex((host_name, self.port)):
            if trial > max_retries:
                raise ValueError('Corenlp server is not available')
            logging.info('Waiting until the server is available.')
            trial += 1
            time.sleep(1)
        logging.info('The server is available.')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        logging.info('Cleanup...')
        if hasattr(self, 'p'):
            try:
                parent = psutil.Process(self.p.pid)
            except psutil.NoSuchProcess:
                logging.info('No process: {}'.format(self.p.pid))
                return

            if self.class_path_dir not in ' '.join(parent.cmdline()):
                logging.info('Process not in: {}'.format(parent.cmdline()))
                return

            children = parent.children(recursive=True)
            for process in children:
                logging.info('Killing pid: {}, cmdline: {}'.format(process.pid, process.cmdline()))
                # process.send_signal(signal.SIGTERM)
                process.kill()

            logging.info('Killing shell pid: {}, cmdline: {}'.format(parent.pid, parent.cmdline()))
            # parent.send_signal(signal.SIGTERM)
            parent.kill()
    def annotate(self, text, properties=None):
        if sys.version_info.major >= 3:
            text = text.encode('utf-8')

        r = requests.post(self.url, params={'properties': str(properties)}, data=text,
                          headers={'Connection': 'close'})
        return r.text

    def tregex(self, sentence, pattern):
        tregex_url = self.url + '/tregex'
        r_dict = self._request(tregex_url, "tokenize,ssplit,depparse,parse", sentence, pattern=pattern)
        return r_dict

    def tokensregex(self, sentence, pattern):
        tokensregex_url = self.url + '/tokensregex'
        r_dict = self._request(tokensregex_url, "tokenize,ssplit,depparse", sentence, pattern=pattern)
        return r_dict

    def semgrex(self, sentence, pattern):
        semgrex_url = self.url + '/semgrex'
        r_dict = self._request(semgrex_url, "tokenize,ssplit,depparse", sentence, pattern=pattern)
        return r_dict

    def word_tokenize(self, sentence, span=False):
        r_dict = self._request(self.url, 'ssplit,tokenize', sentence)
        tokens = [token['originalText'] for s in r_dict['sentences'] for token in s['tokens']]

        # Whether to also return (begin, end) character offsets for each token
        if span:
            spans = [(token['characterOffsetBegin'], token['characterOffsetEnd'])
                     for s in r_dict['sentences'] for token in s['tokens']]
            return tokens, spans
        else:
            return tokens

    def pos_tag(self, sentence):
        r_dict = self._request(self.url, 'pos', sentence)
        words = []
        tags = []
        for s in r_dict['sentences']:
            for token in s['tokens']:
                words.append(token['originalText'])
                tags.append(token['pos'])
        return list(zip(words, tags))

    def ner(self, sentence):
        r_dict = self._request(self.url, 'ner', sentence)
        words = []
        ner_tags = []
        for s in r_dict['sentences']:
            for token in s['tokens']:
                words.append(token['originalText'])
                ner_tags.append(token['ner'])
        return list(zip(words, ner_tags))

    def parse(self, sentence):
        # Returns the constituency parse of the first sentence only.
        r_dict = self._request(self.url, 'pos,parse', sentence)
        return [s['parse'] for s in r_dict['sentences']][0]

    def dependency_parse(self, sentence):
        # Returns (relation, governor_index, dependent_index) triples;
        # indices are 1-based token positions, 0 is the artificial ROOT.
        r_dict = self._request(self.url, 'depparse', sentence)
        return [(dep['dep'], dep['governor'], dep['dependent']) for s in r_dict['sentences'] for dep in
                s['basicDependencies']]

    def coref(self, text):
        r_dict = self._request(self.url, 'coref', text)

        corefs = []
        for k, mentions in r_dict['corefs'].items():
            simplified_mentions = []
            for m in mentions:
                simplified_mentions.append((m['sentNum'], m['startIndex'], m['endIndex'], m['text']))
            corefs.append(simplified_mentions)
        return corefs

    def switch_language(self, language="en"):
        self._check_language(language)
        self.lang = language

    def _request(self, url, annotators=None, data=None, *args, **kwargs):
        if sys.version_info.major >= 3:
            data = data.encode('utf-8')

        properties = {'annotators': annotators, 'outputFormat': 'json'}
        params = {'properties': str(properties), 'pipelineLanguage': self.lang}
        if 'pattern' in kwargs:
            params = {"pattern": kwargs['pattern'], 'properties': str(properties), 'pipelineLanguage': self.lang}

        logging.info(params)
        r = requests.post(url, params=params, data=data, headers={'Connection': 'close'})
        r_dict = json.loads(r.text)

        return r_dict

    def _check_args(self):
        self._check_language(self.lang)
        if not re.match(r'\d+g', self.memory):
            raise ValueError('memory=' + self.memory + ' not supported. Use 4g, 6g, 8g, etc.')

    def _check_language(self, lang):
        if lang not in ['en', 'zh', 'ar', 'fr', 'de', 'es']:
            raise ValueError('lang=' + lang + ' not supported. Use English(en), Chinese(zh), Arabic(ar), '
                             'French(fr), German(de), Spanish(es).')
' 44 | pros = {'annotators': 'tokenize,ssplit,pos', 'pinelineLanguage': 'en', 'outputFormat': 'xml'} 45 | print(nlp.annotate(text, properties=pros)) 46 | nlp.close() 47 | 48 | # Use an existing server 49 | nlp = StanfordCoreNLP('http://corenlp.run', port=80) 50 | -------------------------------------------------------------------------------- /unit_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from stanfordcorenlp import StanfordCoreNLP 4 | 5 | 6 | class MyTestCase(unittest.TestCase): 7 | def test_args(self): 8 | self.assertRaises(IOError, StanfordCoreNLP, '/abc') 9 | self.assertRaises(ValueError, StanfordCoreNLP, r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', 10 | lang='abc') 11 | self.assertRaises(ValueError, StanfordCoreNLP, r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', 12 | memory='4m') 13 | 14 | 15 | if __name__ == '__main__': 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /wiki/coref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lynten/stanford-corenlp/dec81f51b72469877512c78abc45fd2581bd1237/wiki/coref.png --------------------------------------------------------------------------------