├── setup.cfg
├── corenlp_pywrap
│   ├── __init__.py
│   ├── test_using_pytest.py
│   ├── example.py
│   └── pywrap.py
├── setup.py
├── LICENCE.txt
├── .gitignore
└── README.md

/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/corenlp_pywrap/__init__.py:
--------------------------------------------------------------------------------
1 | from corenlp_pywrap.pywrap import CoreNLP
--------------------------------------------------------------------------------
/corenlp_pywrap/test_using_pytest.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | myPath = os.path.dirname(os.path.abspath(__file__))
3 | sys.path.insert(0, myPath)
4 | 
5 | import pywrap as p
6 | 
7 | class Test_common():
8 |     cn = p.CoreNLP
9 |     def test_localurl(self):
10 |         assert '127.0.0.1' in self.cn.url\
11 |             or 'localhost' in self.cn.url,\
12 |             'script is pointing to cloud'
13 |         assert not self.cn.url.endswith('/'), 'url must not end with /'
14 |     def test_no_sentiments(self):
15 |         assert 'SENTIMENT' not in map(
16 |             str.upper, self.cn.annotator_full_list),\
17 |             'Sentiment is not supported'
18 |     def test_annot_len(self):
19 |         assert len(self.cn.annotator_full_list) == 14
20 | 
21 | class Test_serverconnection():
22 |     sc_obj = p.CoreNLP()
23 |     cur_url = sc_obj.url_calc()
24 |     r = sc_obj.server_connection(cur_url, 'This is Sherin')
25 |     r = r.json()
26 | 
27 |     def test_return_type(self):
28 |         assert isinstance(self.r, dict), 'server_connection should return a '\
29 |             'requests response that is convertible to a dictionary'
30 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup
3 | setup(
4 |     name = "corenlp_pywrap",
5 |     version = "1.0.5",
6 |     author = "hhsecond",
7 |     author_email = "sherinct@live.com",
8 |     description = ("A powerful Python wrapper for Stanford CoreNLP"),
9 |     license = "MIT",
10 |     keywords = "stanford corenlp wrapper",
11 |     install_requires=['requests'],
12 |     url = "https://www.github.com/hhsecond/corenlp_pywrap",
13 |     download_url = "https://www.github.com/hhsecond/corenlp_pywrap/tarball/1.0.5",
14 |     packages=['corenlp_pywrap'],
15 |     long_description='Production-ready version equipped with basic '\
16 |         'output fetching from Stanford CoreNLP and a custom arrange function; '\
17 |         'for more info, '\
18 |         'check out: https://github.com/hhsecond/corenlp_pywrap',
19 |     classifiers=[
20 |         "Development Status :: 5 - Production/Stable",
21 |         "Intended Audience :: Science/Research",
22 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
23 |         "License :: OSI Approved :: MIT License",
24 |     ],
25 | )
--------------------------------------------------------------------------------
/LICENCE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Sherin Thomas
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/corenlp_pywrap/example.py:
--------------------------------------------------------------------------------
1 | import pywrap as p
2 | import logging
3 | p.root.setLevel(logging.DEBUG)
4 | cn = p.CoreNLP()
5 | sent = ''' .cb-list-heading{font-weight:normal!important} MOBILE SITE & APPSm.cricbuzz.comAndroidiOSWindows MobileBlackberryChrome ExtensionFOLLOW US ONfacebooktwittergoogle+PinterestRSS FeedCOMPANYCareersAdvertisePrivacy PolicyTerms of UseCricbuzz TV Ads© 2016 Cricbuzz.com, Times Internet Limited. All rights reserved | The Times of India | Navbharat Timesvar script_tag = document.getElementsByTagName('script')[0]; (function() { var cmin = document.createElement('script'); cmin.type = 'text/javascript'; cmin.async = true; cmin.src = 'http://i.cricketcb.com/statics/site/js/cricbuzz.min.1466404921.js'; script_tag.parentNode.insertBefore(cmin, script_tag); })();(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-PGNCT7'); 0'''
6 | r = cn.arrange(sent)
7 | print(len(r['index']))
8 | print(len(r['word']))
9 | print(r['normalizedNER'])
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # corenlp_pywrap 1.0.5
2 | ###### Powerful Python wrapper for the Stanford CoreNLP project
3 | 
4 | [![GitHub release](https://img.shields.io/badge/release-1.0.5-green.svg?maxAge=2592000)](https://github.com/hhsecond/corenlp_pywrap/releases) [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/hhsecond/corenlp_pywrap/blob/master/LICENCE.txt) [![PyPI](https://img.shields.io/pypi/wheel/corenlp_pywrap.svg?maxAge=2592000)](https://pypi.python.org/pypi/corenlp_pywrap)
5 | 
6 | 
7 | ## CoreNLP v3.6.0
8 | - Update your version (both CoreNLP and corenlp_pywrap) for bug fixes and more features
9 | - Works only with Python 3.x
10 | 
11 | 
12 | ### Pywrap Doc
13 | 
14 | #### Install
15 | >pip install corenlp_pywrap
16 | 
17 | or
18 | 
19 | >pip3 install corenlp_pywrap
20 | 
21 | #### Usage
22 | ```python
23 | from corenlp_pywrap import pywrap
24 | cn = pywrap.CoreNLP(url='http://localhost:9000', annotator_list=full_annotator_list)
25 | #full_annotator_list = ["tokenize", "cleanxml", "ssplit", "pos", "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref", "relation", "natlog", "quote"]
26 | 
27 | #Calling the basic function, which returns a 'requests' response object
28 | out = cn.basic(data, out_format='json')
29 | ```
30 | Remember, 'out' is a 'requests' response object; you can get the information by using out.text or out.json()
31 | 
32 | Pywrap does not support 'Sentiment' out of the box, because the downloadable server version of CoreNLP does not have 'Sentiment' support. But there is a workaround (if you are sure that your server version is new enough and has the support):
33 | - You can pass 'sentiment' as part of annotator_list while instantiating the class object
34 | - or
35 | ```
36 | annotator_list = CoreNLP.annotator_full_list + ['sentiment']
37 | ```
38 | 
39 | #### Custom Function
40 | - arrange() can be used for getting formatted output
41 | - The format is given below
42 | ```python
43 | token_dict = {
44 |     'index':[],
45 |     'truecaseText':[],
46 |     'ner':[],
47 |     'before':[],
48 |     'originalText':[],
49 |     'characterOffsetBegin':[],
50 |     'lemma':[],
51 |     'truecase':[],
52 |     'pos':[],
53 |     'characterOffsetEnd':[],
54 |     'speaker':[],
55 |     'word':[],
56 |     'after':[]
57 | }
58 | ```
59 | - arrange() returns a token_dict in the above format.
60 | - arrange() currently works only on 'sentences'. More features like 'enhanced dependencies' are coming in future releases
61 | - Usage:
62 | ```python
63 | from corenlp_pywrap import pywrap
64 | cn = pywrap.CoreNLP(url='http://localhost:9000', annotator_list=full_annotator_list)
65 | #full_annotator_list = ["tokenize", "cleanxml", "ssplit", "pos", "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref", "relation", "natlog", "quote"]
66 | 
67 | #custom function
68 | token_dict = cn.arrange(data)
69 | 
70 | #example out: token_dict['index'] would give you something like this - [1,2,3,4]
71 | ```
72 | 
73 | 
74 | #### Server Instantiation Error
75 | - If you don't have the CoreNLP server downloaded, please download the server [here](http://stanfordnlp.github.io/CoreNLP/download.html)
76 | - Make sure you have Java 8+ installed
77 | - cd to the downloaded folder
78 | - Run the command below
79 | ```
80 | # Run the server using all jars in the current directory (e.g., the CoreNLP home directory)
81 | # port and timeout are optional
82 | java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer [port] [timeout]
83 | ```
84 | - Verify the server instance in the browser
85 | 
86 | > http://localhost:port/
87 | 
88 | Replace 'port' with the port number you have given. If you didn't give a port number, it defaults to 9000
89 | 
90 | > http://localhost:9000/
91 | 
92 | 
93 | 
94 | #### Debugging & Logging
95 | - Pywrap uses the logging module for logging and debugging.
96 | - The default logging level is set to WARNING
97 | - If you need more verbose logs for debugging, change the logging level
98 | 
99 | - Default log facilities
100 | ```python
101 | root = logging.getLogger('Root')
102 | root.setLevel(logging.WARNING)
103 | 
104 | lhandler = logging.StreamHandler(sys.stdout)
105 | formatter = logging.Formatter(
106 |     '%(asctime)s [%(levelname)s] : %(message)s',
107 |     '%Y-%m-%d %H:%M:%S')
108 | lhandler.setFormatter(formatter)
109 | root.addHandler(lhandler)
110 | ```
111 | 
112 | - You can modify each of these as in the example below
113 | ```python
114 | import corenlp_pywrap as cp
115 | import logging
116 | cp.pywrap.root.setLevel(logging.DEBUG)
117 | ```
118 | 
--------------------------------------------------------------------------------
/corenlp_pywrap/pywrap.py:
--------------------------------------------------------------------------------
1 | import requests, logging, sys
2 | 
3 | root = logging.getLogger('Root')
4 | root.setLevel(logging.WARNING)
5 | 
6 | lhandler = logging.StreamHandler(sys.stdout)
7 | formatter = logging.Formatter(
8 |     '%(asctime)s [%(levelname)s] : %(message)s',
9 |     '%Y-%m-%d %H:%M:%S')
10 | lhandler.setFormatter(formatter)
11 | root.addHandler(lhandler)
12 | 
13 | class CoreNLP:
14 |     root.debug('Object instantiating..')
15 |     annotator_full_list = ["tokenize", "cleanxml", "ssplit", "pos",
16 |         "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref",
17 |         "relation", "natlog", "quote"]
18 |     url = 'http://127.0.0.1:9000'
19 |     out_format = 'json'
20 | 
21 |     def __init__(self, url=url, annotator_list=annotator_full_list):
22 |         assert url.upper().startswith('HTTP'), \
23 |             'url string should be prefixed with http'
24 |         if 'SENTIMENT' in map(str.upper, annotator_list):
25 |             root.warning('You are using the "Sentiment" annotator, which is '\
26 |                 'not supported by old versions of CoreNLP')
27 | 
28 |         if url.endswith('/'):
29 |             self.url = url[:-1]
30 |         else:
31 |             self.url = url
32 | 
33 |         assert isinstance(annotator_list, list), \
34 |             'annotators can be passed only as a python list'
35 |         if len(annotator_list) == 14:
36 |             root.info('Using all the annotators, it might take a while')
37 | 
38 |         self.annotator_list = annotator_list
39 | 
40 |         common = set(self.annotator_list).intersection(self.annotator_full_list)
41 |         not_supported_elem = set(self.annotator_list) - common
42 |         assertion_error = 'annotator not supported: ' + str(not_supported_elem)
43 |         assert not not_supported_elem, assertion_error
44 | 
45 | 
46 |     @staticmethod
47 |     def server_connection(current_url, data):
48 |         root.debug('server connection: ' + current_url)
49 |         try:
50 |             server_out = requests.post(current_url,
51 |                 data,
52 |                 headers={'Connection': 'close'})
53 |         except requests.exceptions.ConnectionError:
54 |             root.error('Connection Error, check that you have the server running')
55 |             raise Exception('Check your CoreNLP Server status \n'
56 |                 'If not sure, check the pywrap doc for server instantiation')
57 |         return server_out
58 | 
59 | 
60 |     def url_calc(self, serializer=''):
61 |         s_string = '/?properties={"annotators": "'
62 |         anot_string = ','.join(self.annotator_list)
63 |         m_string = '", "outputFormat": "' + self.out_format
64 |         f_string = '", "serializer": "' + serializer + '"}'
65 |         return self.url + s_string + anot_string + m_string + f_string
66 | 
67 | 
68 |     def basic(self, data, out_format='json', serializer=''):
69 |         self.out_format = out_format
70 |         format_list = ['JSON', 'XML', 'TEXT', 'SERIALIZED']
71 |         assert out_format.upper() in format_list, \
72 |             'output format not supported, check the Stanford doc'
73 | 
74 |         if out_format.upper() == 'SERIALIZED' and not serializer:
75 |             root.info(
76 |                 'Default serializer is being used - ' +
77 |                 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer')
78 |             serializer = ('edu.stanford.nlp.pipeline.'
79 |                 'ProtobufAnnotationSerializer')
80 | 
81 |         current_url = self.url_calc(serializer)
82 |         assert isinstance(data, str) and data, 'Enter valid string input'
83 | 
84 |         return self.server_connection(current_url, data)
85 | 
86 |     @staticmethod
87 |     def tokensregex(data, pattern='', custom_filter=''):
88 |         root.info('TokenRegex started')
89 |         return CoreNLP.regex('/tokensregex', data, pattern, custom_filter)
90 | 
91 |     @staticmethod
92 |     def semgrex(data, pattern='', custom_filter=''):
93 |         root.info('SemRegex started')
94 |         return CoreNLP.regex('/semgrex', data, pattern, custom_filter)
95 | 
96 |     @staticmethod
97 |     def tregex(data, pattern='', custom_filter=''):
98 |         root.info('Tregex started')
99 |         return CoreNLP.regex('/tregex', data, pattern, custom_filter)
100 | 
101 |     @classmethod
102 |     def regex(cls, endpoint, data, pattern, custom_filter):
103 |         url_string = '/?pattern=' + str(pattern) + '&filter=' + custom_filter
104 |         current_url = cls.url + endpoint + url_string
105 |         root.info('Returning the data requested')
106 |         return cls.server_connection(current_url, data)
107 | 
108 |     @staticmethod
109 |     def process_sentences(sentences):
110 |         assert isinstance(sentences, list), 'sentences should be a list'
111 |         index = 0
112 |         new_index = 0
113 |         token_dict = {
114 |             'index':[],
115 |             'truecaseText':[],
116 |             'ner':[],
117 |             'before':[],
118 |             'originalText':[],
119 |             'characterOffsetBegin':[],
120 |             'lemma':[],
121 |             'truecase':[],
122 |             'pos':[],
123 |             'characterOffsetEnd':[],
124 |             'speaker':[],
125 |             'word':[],
126 |             'after':[],
127 |             'normalizedNER':[]
128 |         }
129 |         for sentence in sentences:
130 |             index = new_index
131 |             tokens = sentence['tokens']
132 |             for val in tokens:
133 | 
134 |                 # workaround to handle length inconsistencies with normalizedNER; rethink the logic
135 |                 if 'ner' in val.keys() and 'normalizedNER' not in val.keys():
136 |                     token_dict['normalizedNER'].append(0)
137 | 
138 |                 for key, value in val.items():
139 |                     if key == 'index':
140 |                         new_index = index + int(value)
141 |                         token_dict[key].append(str(new_index))
142 |                     else:
143 |                         try:
144 |                             token_dict[key].append(value)
145 |                         except KeyError:
146 |                             token_dict[key] = [value]
147 |                             root.info('New key added: ' + key)
148 |         return token_dict
149 | 
150 | 
151 |     def arrange(self, data):
152 |         root.info('Executing custom function')
153 |         assert isinstance(data, str) and data, 'Enter valid string input'
154 |         if 'lemma' not in self.annotator_list:
155 |             self.annotator_list.append('lemma')
156 | 
157 |         current_url = self.url_calc()
158 |         r = self.server_connection(current_url, data)
159 |         try:
160 |             r = r.json()
161 |             rs = r['sentences']
162 |         except ValueError:
163 |             root.error('Value Error: ' + r.text + ', check special chars in input')
164 |             rs = []
165 |         return self.process_sentences(rs)
166 | 
--------------------------------------------------------------------------------
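
Below is a minimal end-to-end usage sketch that ties the pieces above together. It assumes a CoreNLP server is already running locally at http://localhost:9000; the reduced annotator list, the sample sentence, and the zip-based printout are illustrative choices, not part of the package.

```python
# Illustrative usage sketch (not one of the repository files above).
# Assumes a CoreNLP server is running at http://localhost:9000.
from corenlp_pywrap import pywrap

# A reduced annotator list keeps the request fast; any subset of
# CoreNLP.annotator_full_list is accepted by the constructor.
cn = pywrap.CoreNLP(url='http://localhost:9000',
                    annotator_list=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'])

text = 'Stanford University is located in California.'

# basic() returns the raw 'requests' response; .json() gives the CoreNLP output.
raw = cn.basic(text, out_format='json').json()
print(len(raw['sentences']))

# arrange() flattens the response into parallel lists keyed by token attribute.
token_dict = cn.arrange(text)
for word, lemma, pos, ner in zip(token_dict['word'], token_dict['lemma'],
                                 token_dict['pos'], token_dict['ner']):
    print(word, lemma, pos, ner)
```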