├── setup.cfg
├── corenlp_pywrap
│   ├── __init__.py
│   ├── test_using_pytest.py
│   ├── example.py
│   └── pywrap.py
├── setup.py
├── LICENCE.txt
├── .gitignore
└── README.md

/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/corenlp_pywrap/__init__.py:
--------------------------------------------------------------------------------
1 | from corenlp_pywrap.pywrap import CoreNLP
--------------------------------------------------------------------------------
/corenlp_pywrap/test_using_pytest.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | myPath = os.path.dirname(os.path.abspath(__file__))
3 | sys.path.insert(0, myPath)
4 | 
5 | import pywrap as p
6 | 
7 | class Test_common():
8 |     cn = p.CoreNLP
9 |     def test_localurl(self):
10 |         assert '127.0.0.1' in self.cn.url\
11 |             or 'localhost' in self.cn.url,\
12 |             'script is pointing to cloud'
13 |         assert not self.cn.url.endswith('/'), 'url must not end with /'
14 |     def test_no_sentiments(self):
15 |         assert 'SENTIMENT' not in map(
16 |             str.upper, self.cn.annotator_full_list),\
17 |             'Sentiment is not supported'
18 |     def test_annot_len(self):
19 |         assert len(self.cn.annotator_full_list) == 14
20 | 
21 | class Test_serverconnection():
22 |     sc_obj = p.CoreNLP()
23 |     cur_url = sc_obj.url_calc()
24 |     r = sc_obj.server_connection(cur_url, 'This is Sherin')
25 |     r = r.json()
26 | 
27 |     def test_return_type(self):
28 |         assert isinstance(self.r, dict), 'server_connection should return a '\
29 |             'requests response that is convertible to a dictionary'
30 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup
3 | setup(
4 |     name = "corenlp_pywrap",
5 |     version = "1.0.5",
6 |     author = "hhsecond",
7 |     author_email = "sherinct@live.com",
8 |     description = ("A powerful Python wrapper for Stanford CoreNLP"),
9 |     license = "MIT",
10 |     keywords = "stanford corenlp wrapper",
11 |     install_requires=['requests'],
12 |     url = "https://www.github.com/hhsecond/corenlp_pywrap",
13 |     download_url = "https://www.github.com/hhsecond/corenlp_pywrap/tarball/1.0.5",
14 |     packages=['corenlp_pywrap'],
15 |     long_description='Production-ready version equipped with basic '\
16 |         'output fetching from Stanford CoreNLP and a custom arrange function; '\
17 |         'for more info, '\
18 |         'check out: https://github.com/hhsecond/corenlp_pywrap',
19 |     classifiers=[
20 |         "Development Status :: 5 - Production/Stable",
21 |         "Intended Audience :: Science/Research",
22 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
23 |         "License :: OSI Approved :: MIT License",
24 |     ],
25 | )
--------------------------------------------------------------------------------
/LICENCE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Sherin Thomas
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/corenlp_pywrap/example.py:
--------------------------------------------------------------------------------
1 | import pywrap as p
2 | import logging
3 | p.root.setLevel(logging.DEBUG)
4 | cn = p.CoreNLP()
5 | sent = ''' .cb-list-heading{font-weight:normal!important} MOBILE SITE & APPSm.cricbuzz.comAndroidiOSWindows MobileBlackberryChrome ExtensionFOLLOW US ONfacebooktwittergoogle+PinterestRSS FeedCOMPANYCareersAdvertisePrivacy PolicyTerms of UseCricbuzz TV Ads© 2016 Cricbuzz.com, Times Internet Limited. All rights reserved | The Times of India | Navbharat Timesvar script_tag = document.getElementsByTagName('script')[0]; (function() { var cmin = document.createElement('script'); cmin.type = 'text/javascript'; cmin.async = true; cmin.src = 'http://i.cricketcb.com/statics/site/js/cricbuzz.min.1466404921.js'; script_tag.parentNode.insertBefore(cmin, script_tag); })();(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-PGNCT7'); 0'''
6 | r = cn.arrange(sent)
7 | print(len(r['index']))
8 | print(len(r['word']))
9 | print(r['normalizedNER'])
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # corenlp_pywrap 1.0.5
2 | ###### Powerful Python wrapper for the Stanford CoreNLP project
3 | 
4 | [![GitHub release](https://img.shields.io/badge/release-1.0.5-green.svg?maxAge=2592000)](https://github.com/hhsecond/corenlp_pywrap/releases) [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/hhsecond/corenlp_pywrap/blob/master/LICENCE.txt) [![PyPI](https://img.shields.io/pypi/wheel/corenlp_pywrap.svg?maxAge=2592000)](https://pypi.python.org/pypi/corenlp_pywrap)
5 | 
6 | 
7 | ## CoreNLP v3.6.0
8 | - Update your version (both CoreNLP and corenlp_pywrap) for bug fixes and more features
9 | - Works only with Python 3.x
10 | 
11 | 
12 | ### Pywrap Doc
13 | 
14 | #### Install
15 | >pip install corenlp_pywrap
16 | 
17 | or
18 | 
19 | >pip3 install corenlp_pywrap
20 | 
21 | #### Usage
22 | ```python
23 | from corenlp_pywrap import pywrap
24 | cn = pywrap.CoreNLP(url='http://localhost:9000', annotator_list=full_annotator_list)
25 | #full_annotator_list = ["tokenize", "cleanxml", "ssplit", "pos", "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref", "relation", "natlog", "quote"]
26 | 
27 | #Calling the basic function, which returns a 'requests' response object
28 | out = cn.basic(data, out_format='json')
29 | ```
30 | Remember, 'out' is a 'requests' response object; you can get the information by using out.text or out.json()
31 | 
32 | Pywrap does not support 'Sentiment' out of the box, because the downloadable server version of CoreNLP does not have 'Sentiment' support. But there is a workaround (if you are sure that your server version is new enough and has the support):
33 | - You can pass 'sentiment' as part of annotator_list while instantiating the class object
34 | - or
35 | ```
36 | annotator_list = CoreNLP.annotator_full_list + ['sentiment']
37 | ```
38 | 
39 | #### Custom Function
40 | - arrange() can be used for getting formatted output
41 | - The format is given below
42 | ```python
43 | token_dict = {
44 |     'index':[],
45 |     'truecaseText':[],
46 |     'ner':[],
47 |     'before':[],
48 |     'originalText':[],
49 |     'characterOffsetBegin':[],
50 |     'lemma':[],
51 |     'truecase':[],
52 |     'pos':[],
53 |     'characterOffsetEnd':[],
54 |     'speaker':[],
55 |     'word':[],
56 |     'after':[]
57 | }
58 | ```
59 | - arrange() returns a token_dict in the above format.
60 | - arrange() currently works only on 'sentences'. More features like 'enhanced dependencies' are coming in future releases
61 | - Usage:
62 | ```python
63 | from corenlp_pywrap import pywrap
64 | cn = pywrap.CoreNLP(url='http://localhost:9000', annotator_list=full_annotator_list)
65 | #full_annotator_list = ["tokenize", "cleanxml", "ssplit", "pos", "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref", "relation", "natlog", "quote"]
66 | 
67 | #custom function
68 | token_dict = cn.arrange(data)
69 | 
70 | #example out: token_dict['index'] would give you something like this - [1,2,3,4]
71 | ```
72 | 
73 | 
74 | #### Server Instantiation Error
75 | - If you don't have the CoreNLP server downloaded, please download the server [here](http://stanfordnlp.github.io/CoreNLP/download.html)
76 | - Make sure you have Java 8+ installed
77 | - cd to the downloaded folder
78 | - Run the command below
79 | ```
80 | # Run the server using all jars in the current directory (e.g., the CoreNLP home directory)
81 | # port and timeout are optional
82 | java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer [port] [timeout]
83 | ```
84 | - Verify the server instance in the browser
85 | 
86 | > http://localhost:port/
87 | 
88 | Replace 'port' with the port number you have given. If you didn't give a port number, it defaults to 9000
89 | 
90 | > http://localhost:9000/
91 | 
92 | 
93 | 
94 | #### Debugging & Logging
95 | - Pywrap uses the logging module for logging and debugging.
96 | - The default logging level is set to WARNING
97 | - If you need more verbose logs for debugging, change the logging level
98 | 
99 | - Default log facilities
100 | ```python
101 | root = logging.getLogger('Root')
102 | root.setLevel(logging.WARNING)
103 | 
104 | lhandler = logging.StreamHandler(sys.stdout)
105 | formatter = logging.Formatter(
106 |     '%(asctime)s [%(levelname)s] : %(message)s',
107 |     '%Y-%m-%d %H:%M:%S')
108 | lhandler.setFormatter(formatter)
109 | root.addHandler(lhandler)
110 | ```
111 | 
112 | - You can modify each of these as in the example below
113 | ```python
114 | import corenlp_pywrap as cp
115 | import logging
116 | cp.pywrap.root.setLevel(logging.DEBUG)
117 | ```
118 | 
--------------------------------------------------------------------------------
/corenlp_pywrap/pywrap.py:
--------------------------------------------------------------------------------
1 | import requests, logging, sys
2 | 
3 | root = logging.getLogger('Root')
4 | root.setLevel(logging.WARNING)
5 | 
6 | lhandler = logging.StreamHandler(sys.stdout)
7 | formatter = logging.Formatter(
8 |     '%(asctime)s [%(levelname)s] : %(message)s',
9 |     '%Y-%m-%d %H:%M:%S')
10 | lhandler.setFormatter(formatter)
11 | root.addHandler(lhandler)
12 | 
13 | class CoreNLP:
14 |     root.debug('Object instantiating..')
15 |     annotator_full_list = ["tokenize", "cleanxml", "ssplit", "pos",
16 |         "lemma", "ner", "regexner", "truecase", "parse", "depparse", "dcoref",
17 |         "relation", "natlog", "quote"]
18 |     url = 'http://127.0.0.1:9000'
19 |     out_format = 'json'
20 | 
21 |     def __init__(self, url=url, annotator_list=annotator_full_list):
22 |         assert url.upper().startswith('HTTP'), \
23 |             'url string should be prefixed with http'
24 |         if 'SENTIMENT' in map(str.upper, annotator_list):
25 |             root.warning('You are using the "Sentiment" annotator, which is '\
26 |                 'not supported by old versions of CoreNLP')
27 | 
28 |         if url.endswith('/'):
29 |             self.url = url[:-1]
30 |         else:
31 |             self.url = url
32 | 
33 |         assert isinstance(annotator_list, list), \
34 |             'annotators can be passed only as a python list'
35 |         if len(annotator_list) == 14:
36 |             root.info('Using all the annotators, it might take a while')
37 | 
38 |         self.annotator_list = annotator_list
39 | 
40 |         common = set(self.annotator_list).intersection(self.annotator_full_list)
41 |         not_supported_elem = set(self.annotator_list) - common
42 |         assertion_error = 'annotator not supported: ' + str(not_supported_elem)
43 |         assert not not_supported_elem, assertion_error
44 | 
45 | 
46 |     @staticmethod
47 |     def server_connection(current_url, data):
48 |         root.debug('server connection: ' + current_url)
49 |         try:
50 |             server_out = requests.post(current_url,
51 |                 data,
52 |                 headers={'Connection': 'close'})
53 |         except requests.exceptions.ConnectionError:
54 |             root.error('Connection Error, check that you have the server running')
55 |             raise Exception('Check your CoreNLP Server status \n'
56 |                 'If not sure, check the pywrap doc for server instantiation')
57 |         return server_out
58 | 
59 | 
60 |     def url_calc(self, serializer=''):
61 |         s_string = '/?properties={"annotators": "'
62 |         anot_string = ','.join(self.annotator_list)
63 |         m_string = '", "outputFormat": "' + self.out_format
64 |         f_string = '", "serializer": "' + serializer + '"}'
65 |         return self.url + s_string + anot_string + m_string + f_string
66 | 
67 | 
68 |     def basic(self, data, out_format='json', serializer=''):
69 |         self.out_format = out_format
70 |         format_list = ['JSON', 'XML', 'TEXT', 'SERIALIZED']
71 |         assert out_format.upper() in format_list, \
72 |             'output format not supported, check the Stanford doc'
73 | 
74 |         if out_format.upper() == 'SERIALIZED' and not serializer:
75 |             root.info(
76 |                 'Default serializer is being used - ' +
77 |                 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer')
78 |             serializer = ('edu.stanford.nlp.pipeline.'
79 |                 'ProtobufAnnotationSerializer')
80 | 
81 |         current_url = self.url_calc(serializer)
82 |         assert isinstance(data, str) and data, 'Enter valid string input'
83 | 
84 |         return self.server_connection(current_url, data)
85 | 
86 |     @staticmethod
87 |     def tokensregex(data, pattern='', custom_filter=''):
88 |         root.info('TokenRegex started')
89 |         return CoreNLP.regex('/tokensregex', data, pattern, custom_filter)
90 | 
91 |     @staticmethod
92 |     def semgrex(data, pattern='', custom_filter=''):
93 |         root.info('SemRegex started')
94 |         return CoreNLP.regex('/semgrex', data, pattern, custom_filter)
95 | 
96 |     @staticmethod
97 |     def tregex(data, pattern='', custom_filter=''):
98 |         root.info('Tregex started')
99 |         return CoreNLP.regex('/tregex', data, pattern, custom_filter)
100 | 
101 |     @classmethod
102 |     def regex(cls, endpoint, data, pattern, custom_filter):
103 |         url_string = '/?pattern=' + str(pattern) + '&filter=' + custom_filter
104 |         current_url = cls.url + endpoint + url_string
105 |         root.info('Returning the data requested')
106 |         return cls.server_connection(current_url, data)
107 | 
108 |     @staticmethod
109 |     def process_sentences(sentences):
110 |         assert isinstance(sentences, list), 'sentences should be a list'
111 |         index = 0
112 |         new_index = 0
113 |         token_dict = {
114 |             'index':[],
115 |             'truecaseText':[],
116 |             'ner':[],
117 |             'before':[],
118 |             'originalText':[],
119 |             'characterOffsetBegin':[],
120 |             'lemma':[],
121 |             'truecase':[],
122 |             'pos':[],
123 |             'characterOffsetEnd':[],
124 |             'speaker':[],
125 |             'word':[],
126 |             'after':[],
127 |             'normalizedNER':[]
128 |         }
129 |         for sentence in sentences:
130 |             index = new_index
131 |             tokens = sentence['tokens']
132 |             for val in tokens:
133 | 
134 |                 # workaround to handle length inconsistencies with normalizedNER; rethink the logic
135 |                 if 'ner' in val.keys() and 'normalizedNER' not in val.keys():
136 |                     token_dict['normalizedNER'].append(0)
137 | 
138 |                 for key, value in val.items():
139 |                     if key == 'index':
140 |                         new_index = index + int(value)
141 |                         token_dict[key].append(str(new_index))
142 |                     else:
143 |                         try:
144 |                             token_dict[key].append(value)
145 |                         except KeyError:
146 |                             token_dict[key] = [value]
147 |                             root.info('New key added: ' + key)
148 |         return token_dict
149 | 
150 | 
151 |     def arrange(self, data):
152 |         root.info('Executing custom function')
153 |         assert isinstance(data, str) and data, 'Enter valid string input'
154 |         if 'lemma' not in self.annotator_list:
155 |             self.annotator_list.append('lemma')
156 | 
157 |         current_url = self.url_calc()
158 |         r = self.server_connection(current_url, data)
159 |         try:
160 |             r = r.json()
161 |             rs = r['sentences']
162 |         except ValueError:
163 |             root.error('Value Error: ' + r.text + ', check special chars in input')
164 |             rs = []
165 |         return self.process_sentences(rs)
166 | 
--------------------------------------------------------------------------------
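
Below is a minimal end-to-end usage sketch that ties the pieces above together. It assumes a CoreNLP server is already running locally at http://localhost:9000; the reduced annotator list, the sample sentence, and the zip-based printout are illustrative choices, not part of the package.

```python
# Illustrative usage sketch (not one of the repository files above).
# Assumes a CoreNLP server is running at http://localhost:9000.
from corenlp_pywrap import pywrap

# A reduced annotator list keeps the request fast; any subset of
# CoreNLP.annotator_full_list is accepted by the constructor.
cn = pywrap.CoreNLP(url='http://localhost:9000',
                    annotator_list=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'])

text = 'Stanford University is located in California.'

# basic() returns the raw 'requests' response; .json() gives the CoreNLP output.
raw = cn.basic(text, out_format='json').json()
print(len(raw['sentences']))

# arrange() flattens the response into parallel lists keyed by token attribute.
token_dict = cn.arrange(text)
for word, lemma, pos, ner in zip(token_dict['word'], token_dict['lemma'],
                                 token_dict['pos'], token_dict['ner']):
    print(word, lemma, pos, ner)
```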