├── .gitignore
├── LICENSE.rst
├── README.md
├── app
    ├── __init__.py
    ├── admin.py
    ├── constants.py
    ├── middleware
    │   ├── __init__.py
    │   └── validate_middleware.py
    ├── migrations
    │   └── __init__.py
    ├── models.py
    ├── tests.py
    ├── trained
    │   ├── bigram_editorial.pkl
    │   ├── bigram_humor.pkl
    │   ├── bigram_learned.pkl
    │   ├── bigram_news.pkl
    │   ├── bigram_religion.pkl
    │   ├── bigram_reviews.pkl
    │   ├── bigram_romance.pkl
    │   ├── bigram_science_fiction.pkl
    │   ├── unigram_editorial.pkl
    │   ├── unigram_humor.pkl
    │   ├── unigram_learned.pkl
    │   ├── unigram_news.pkl
    │   ├── unigram_religion.pkl
    │   ├── unigram_reviews.pkl
    │   ├── unigram_romance.pkl
    │   └── unigram_science_fiction.pkl
    ├── urls.py
    ├── util.py
    └── views.py
├── manage.py
├── nltk_api
    ├── __init__.py
    ├── settings.py
    ├── urls.py
    └── wsgi.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | nltk-api-server/
2 | *.pyc
3 | *.swp
4 | 


--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016 Vipul Sharma
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 7 | of the Software, and to permit persons to whom the Software is furnished to do
 8 | so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | nltk-api-server
  2 | ===============
  3 | 
  4 | API server for NLTK. Aimed to provide convenient interface to use NLTK over
  5 | any programming language
  6 | 
  7 | Features
  8 | ========
  9 | * Stemming
 10 | * Lemmatization
 11 | * Part of Speech Tagging (Unigram/Bigram/Regex)
 12 | * Named Entity Recognition
 13 | * Sentiment Analysis
 14 | 
 15 | ---------------
 16 | **1. Stemming**
 17 | ---------------
 18 | 
 19 | NLTK Stemmers used: Porter, Snowball and Lancaster
 20 | 
 21 | * **Accepts:**
 22 | 
 23 |   * `/api/stem?words=<words>/`
 24 | 
 25 |   * `/api/stem?words=<words>&stemmer=<porter/snowball/lancaster/default>/`
 26 | 
 27 |   * `/api/stem?words=<words>&stemmer=snowball&language=<language>&ignore_stopwords=<true/false>/`
 28 | 
 29 | * **Query Parameters:**
 30 | 
 31 |   * Mandatory:
 32 |       * `words`:
 33 | 
 34 |           ***value:*** string comma separated
 35 |   * Optional:
 36 |       * `stemmer`:
 37 | 
 38 |           ***value:*** porter/snowball/lancaster/default
 39 | 
 40 |           ***default:*** snowball
 41 | 
 42 |       * `ignore_stopwords`: Only for Snowball Stemmer
 43 | 
 44 |            ***value:*** true/false
 45 | 
 46 |            ***default:*** false
 47 | 
 48 |       * `language`: Only for Snowball Stemmer
 49 | 
 50 |            ***value:*** see SnowballStemmer.languages
 51 | 
 52 |            ***default:*** english
 53 | 
 54 | ------------
 55 | 1.1 Examples
 56 | ------------
 57 | 
 58 | * `localhost:9000/api/stem?words=dangerous,monitoring,testing`
 59 | 
 60 |         {
 61 |             "status": true,
 62 |             "result": [
 63 |                 "danger",
 64 |                 "monitor",
 65 |                 "test"
 66 |             ]
 67 |         }
 68 | 
 69 | * `localhost:9000/api/stem/?words=dangerous,monitoring,testing&stemmer=snowball`
 70 | 
 71 |         {
 72 |             "status": true,
 73 |             "result": [
 74 |                 "dog",
 75 |                 "cat",
 76 |                 "vertex"
 77 |             ]
 78 |         }
 79 | 
 80 | * The above examples do not cover all cases. See the section above examples
 81 |   for more features
 82 | 
 83 | --------------------
 84 | **2. Lemmatization**
 85 | --------------------
 86 | 
 87 | NLTK Lemmatizer used: WordNetLemmatizer
 88 | 
 89 | * **Accepts:**
 90 | 
 91 |   * `/api/lemma?words=<words>/`
 92 | 
 93 | * **Query Parameters:**
 94 |    * Mandatory:
 95 |        * `words`:
 96 | 
 97 |            ***value:*** string comma separated
 98 | 
 99 | ------------
100 | 2.1 Examples
101 | ------------
102 | * `localhost/api/lemma/?words=dogs,cats,vertices`
103 | 
104 |         {
105 |             "status":true,
106 |             "result": [
107 |                 "dog",
108 |                 "cat",
109 |                 "vertex"
110 |             ]
111 |         }
112 | 
113 | -----------------------------
114 | **3. Part of Speech Tagging**
115 | -----------------------------
116 | 
117 | NLTK POS tagger used: pos_tag, UnigramTagger, BigramTagger & RegexpTagger
118 | 
119 | * **Accepts:**
120 | 
121 |   * `/api/tag?sentence=<sentence>/`
122 | 
123 |   * `/api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram/default>/`
124 | 
125 |   * `/api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram/default>&train=<categories>/`
126 | 
127 |   including any query parameter accepted by /api/tag/
128 | 
129 | * **Query Parameters:**
130 | 
131 |   * Mandatory:
132 |       * `sentence`:
133 | 
134 |           ***value:*** string
135 | 
136 |   * Optional:
137 | 
138 |       * `tagger`:
139 | 
140 |           ***value:*** pos/unigram/bigram/regex
141 | 
142 |           ***default:*** pos_tag
143 | 
144 |       * `train` (iff unigram/bigram):
145 | 
146 |           ***value:*** 'news', 'editorial', 'reviews', 'religion',
147 |                  'learned', 'science_fiction', 'romance', 'humor'
148 | 
149 |           ***default:*** 'news'
150 | 
151 |       * **any query parameter acceptable by /api/tag/**
152 | 
153 | ------------
154 | 3.1 Examples
155 | ------------
156 | * `localhost/api/sentence=this is a test`
157 | 
158 |         {
159 |             "status": true,
160 |             "result": [
161 |                 [
162 |                     "this",
163 |                     "DT"
164 |                 ],
165 |                 [
166 |                     "is",
167 |                     "VBZ"
168 |                 ],
169 |                 [
170 |                     "a",
171 |                     "DT"
172 |                 ],
173 |                 [
174 |                     "test",
175 |                     "NN"
176 |                 ]
177 |             ]
178 |         }
179 | 
180 | * `localhost/api/sentence=this is a test&tagger=unigram`
181 | 
182 |         {
183 |             "status": true,
184 |             "result": [
185 |                 [
186 |                     "this",
187 |                     "DT"
188 |                 ],
189 |                 [
190 |                     "is",
191 |                     "BEZ"
192 |                 ],
193 |                 [
194 |                     "a",
195 |                     "AT"
196 |                 ],
197 |                 [
198 |                     "test",
199 |                     "NN"
200 |                 ]
201 |             ]
202 |         }
203 | 
204 | * The above examples do not cover all cases. See the section above examples
205 |   for more features
206 | * **Remember, we can also use trained data along with the unigram/bigram tagger:
207 |     'news', 'editorial', 'reviews', 'religion', 'learned', 'science_fiction', 'romance', 'humor'**
208 | 
209 | -------------------------------
210 | **4. Named Entity Recognition**
211 | -------------------------------
212 | 
213 | NLTK NER used: ne_chunk
214 | 
215 | * Accepts:
216 | 
217 |   * `/api/ner?sentence=<sentence>/`
218 | 
219 |   including any query parameter accepted by /api/tag/
220 | 
221 | * Query Parameters:
222 | 
223 |   * Mandatory:
224 |       * `sentence`:
225 | 
226 |           ***value:*** string
227 | 
228 |       * Optional:
229 | 
230 |           * **any query parameter acceptable by /api/tag/**
231 | 
232 | ------------
233 | 4.1 Examples
234 | ------------
235 | 
236 | * `localhost/api/ner?sentence=At the Olympics in August, Phelps picked up five gold medal`
237 | 
238 |         {
239 |             "status": true,
240 |             "result": [
241 |                 "Phelps"
242 |             ]
243 |         }
244 | 
245 | -------------------------
246 | **5. Sentiment Analysis**
247 | -------------------------
248 | 
249 | NLTK Sentiment Analyzer used: vader
250 | 
251 | * **Accepts:**
252 | 
253 |   * `/api/sentiment?sentence=<sentence>/`
254 | 
255 | * **Query Parameters:**
256 | 
257 |   * Mandatory:
258 |       * `sentence`:
259 | 
260 |            ***value:*** string
261 | 
262 |   * Optional:
263 | 
264 |       * **any query parameter acceptable by /api/tag/**
265 | 
266 | ------------
267 | 5.1 Examples
268 | ------------
269 | 
270 | * `localhost/api/sentiment?sentence=At the Olympics in August, Phelps picked up five gold medal`
271 | 
272 |         {
273 |             "status": true,
274 |             "result": {
275 |                 "neg": 0,
276 |                 "neu": 0.256,
277 |                 "pos": 0.744,
278 |                 "compound": 0.4404
279 |             }
280 |         }
281 | 
282 | -------------------
283 | **6. Run on local**
284 | -------------------
285 | 
286 | * `git clone git@github.com:vipul-sharma20/nltk-api-server.git`
287 | * `cd nltk-api-server`
288 | * `sudo pip install virtualenv`
289 |      * Python 2.7.9 and later (on the python2 series), and Python 3.4 and later include pip by default, so you may have pip already.
290 |      * If you don't have pip installed, visit here to see steps to install virtualenv: [https://virtualenv.readthedocs.org/en/latest/installation.html](https://virtualenv.readthedocs.org/en/latest/installation.html)
291 | * `virtualenv nltk-api`
292 | * `source nltk-api/bin/activate`
293 | * `pip install -r requirements.txt` (wait till the requirements are installed)
294 | * `python manage.py runserver` This will run the application on [http://127.0.0.1:8000/](http://127.0.0.1:8000/)
295 | 
296 | **IMPORTANT:** You will require some corpora and trained models
297 | for the code to run. You can refer to: [http://www.nltk.org/data.html](http://www.nltk.org/data.html)
298 | 
299 | * Interactive Method:
300 | 
301 |         In [1]: import nltk
302 | 
303 |         In [2]: nltk.download()
304 | 


--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/__init__.py


--------------------------------------------------------------------------------
/app/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | 
3 | # Register your models here.
4 | 


--------------------------------------------------------------------------------
/app/constants.py:
--------------------------------------------------------------------------------
1 | DEFAULT_STEMMER = 'snowball'
2 | 
3 | DEFAULT_TOKENIZER = 'word'
4 | 
5 | DEFAULT_TAGGER = 'pos'
6 | TRAINERS = ['news', 'editorial', 'reviews', 'religion',
7 |              'learned', 'science_fiction', 'romance', 'humor']
8 | DEFAULT_TRAIN = 'news'


--------------------------------------------------------------------------------
/app/middleware/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/middleware/__init__.py


--------------------------------------------------------------------------------
/app/middleware/validate_middleware.py:
--------------------------------------------------------------------------------
 1 | from django.http import JsonResponse
 2 | 
 3 | 
 4 | class ValidateParameterMiddleware(object):
 5 | 
 6 |     params = {
 7 |             'words': ['stem', 'lemma'],
 8 |             'sentence': ['tokenize', 'ner', 'tag', 'sentiment']
 9 |             }
10 | 
11 |     def process_request(self, request):
12 |         if request.path != '/':
13 |             segments = request.path.split('/')
14 |             if len(segments) > 3:
15 |                 view_type = segments[2]
16 |                 message = ''
17 |                 if view_type in self.params['sentence'] and not \
18 |                         request.GET.get('sentence'):
19 |                     message = 'sentence'
20 |                 elif view_type in self.params['words'] and not \
21 |                         request.GET.get('words'):
22 |                     message = 'words'
23 |                 if message:
24 |                     return JsonResponse({
25 |                         'message': '%s parameter missing' % message,
26 |                         'status': False
27 |                     })
28 | 
29 | 


--------------------------------------------------------------------------------
/app/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/migrations/__init__.py


--------------------------------------------------------------------------------
/app/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 | 
3 | # Create your models here.
4 | 


--------------------------------------------------------------------------------
/app/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 | 
3 | # Create your tests here.
4 | 


--------------------------------------------------------------------------------
/app/trained/bigram_editorial.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_editorial.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_humor.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_humor.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_learned.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_learned.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_news.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_news.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_religion.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_religion.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_reviews.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_reviews.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_romance.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_romance.pkl


--------------------------------------------------------------------------------
/app/trained/bigram_science_fiction.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_science_fiction.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_editorial.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_editorial.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_humor.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_humor.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_learned.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_learned.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_news.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_news.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_religion.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_religion.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_reviews.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_reviews.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_romance.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_romance.pkl


--------------------------------------------------------------------------------
/app/trained/unigram_science_fiction.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_science_fiction.pkl


--------------------------------------------------------------------------------
/app/urls.py:
--------------------------------------------------------------------------------
 1 | from django.conf.urls import url
 2 | from app.views import StemView, TokenizeView, POSTagView, NERView, \
 3 |         LemmatizeView, HomeView, SentimentView
 4 | from rest_framework.urlpatterns import format_suffix_patterns
 5 | 
 6 | urlpatterns = [
 7 |         url(r'^api/$', HomeView.as_view()),
 8 |         url(r'^api/stem/$', StemView.as_view()),
 9 |         url(r'^api/tokenize/$', TokenizeView.as_view()),
10 |         url(r'^api/lemma/$', LemmatizeView.as_view()),
11 |         url(r'^api/tag/$', POSTagView.as_view()),
12 |         url(r'^api/ner/$', NERView.as_view()),
13 |         url(r'^api/sentiment/$', SentimentView.as_view()),
14 |         ]
15 | 
16 | urlpatterns += format_suffix_patterns(urlpatterns)
17 | 


--------------------------------------------------------------------------------
/app/util.py:
--------------------------------------------------------------------------------
  1 | import os.path
  2 | from pickle import load, dump
  3 | 
  4 | import nltk
  5 | from nltk.stem.porter import PorterStemmer
  6 | from nltk.stem import SnowballStemmer, WordNetLemmatizer
  7 | from nltk.stem.lancaster import LancasterStemmer
  8 | from nltk import word_tokenize, pos_tag, UnigramTagger, BigramTagger, \
  9 |     RegexpTagger, ne_chunk, sent_tokenize
 10 | from nltk.tokenize import TweetTokenizer
 11 | from nltk.corpus import brown
 12 | from nltk.sentiment.vader import SentimentIntensityAnalyzer
 13 | 
 14 | from app.constants import DEFAULT_STEMMER, DEFAULT_TOKENIZER, DEFAULT_TAGGER, \
 15 |     DEFAULT_TRAIN, TRAINERS
 16 | 
 17 | 
 18 | 
 19 | class NLTKStem(object):
 20 |     """
 21 |     NLTK Stemmers used: Porter, Snowball and Lancaster
 22 | 
 23 |     Accepts:
 24 | 
 25 |     /api/stem?words=<words>/
 26 |     /api/stem?words=<words>&stemmer=<porter/snowball/lancaster/default>/
 27 |     /api/stem?words=<words>&stemmer=snowball&language=<language>&ignore_stopwords=<true/false>/
 28 | 
 29 |     Query Parameters:
 30 | 
 31 |         * Mandatory:
 32 |             1. words:
 33 |                 type: string comma separated
 34 |         * Optional:
 35 |             1. stemmer:
 36 |                 value: porter/snowball/lancaster/default
 37 |                 default: snowball
 38 |             2. ignore_stopwords: Only for Snowball Stemmer
 39 |                 value: true/false
 40 |                 default: false
 41 |             3. language: Only for Snowball Stemmer
 42 |                 value: see SnowballStemmer.languages
 43 |                 default: english
 44 |     """
 45 | 
 46 |     dispatch = {
 47 |         'porter': PorterStemmer,
 48 |         'snowball': SnowballStemmer,
 49 |         'lancaster': LancasterStemmer,
 50 |     }
 51 |     dispatch['default'] = dispatch[DEFAULT_STEMMER]
 52 | 
 53 |     def __init__(self, options):
 54 |         self.options = options
 55 | 
 56 |     def stem(self):
 57 |         words = self._clean(self.options['words'])
 58 |         stemmer = self.options.get('stemmer', DEFAULT_STEMMER)
 59 |         stemmer_obj = self.dispatch.get(stemmer, self.dispatch[DEFAULT_STEMMER])
 60 |         result = []
 61 | 
 62 |         if stemmer_obj == SnowballStemmer:
 63 |             ignore_stopwords = False
 64 | 
 65 |             if self.options.get('ignore_stopwords'):
 66 |                 if self.options['ignore_stopwords'] == 'true':
 67 |                     ignore_stopwords = True
 68 |             language = self.options.get('language', 'english')
 69 | 
 70 |             result = [stemmer_obj(language, ignore_stopwords).stem(word)
 71 |                       for word in words]
 72 |         else:
 73 |             result = [stemmer_obj().stem(word) for word in words]
 74 | 
 75 |         return self._dump(result)
 76 | 
 77 |     def _clean(self, words):
 78 |         return words.split(',')
 79 | 
 80 |     def _dump(self, result):
 81 |         response = {
 82 |             'status': True,
 83 |             'result': result
 84 |         }
 85 |         return response
 86 | 
 87 | 
 88 | class NLTKTokenize(object):
 89 |     """
 90 |     NLTK Tokenizers used: word_tokenize, StringTokenizer, TweetTokenizer
 91 | 
 92 |     Accepts:
 93 | 
 94 |     /api/tokenize?sentence=<sentence>/
 95 |     /api/tokenize?sentence=<sentence>&tokenizer=<word/tweet/default>/
 96 | 
 97 |     Query Parameters:
 98 | 
 99 |         * Mandatory:
100 |             1. sentence:
101 |                 type: string
102 |         * Optional:
103 |             1. tokenizer:
104 |                 value: word/tweet/default
105 |                 default: word_tokenize
106 |     """
107 | 
108 |     dispatch = {
109 |         'word': word_tokenize,
110 |         'tweet': TweetTokenizer,
111 |     }
112 |     dispatch['default'] = dispatch[DEFAULT_TOKENIZER]
113 | 
114 |     def __init__(self, options):
115 |         self.options = options
116 | 
117 |     def tokenize(self):
118 |         tokenizer = self.options.get('tokenizer', DEFAULT_TOKENIZER)
119 |         tokenizer_obj = self.dispatch.get(tokenizer,
120 |                                           self.dispatch[DEFAULT_TOKENIZER])
121 | 
122 |         if tokenizer_obj == word_tokenize:
123 |             result = tokenizer_obj(self.options['sentence'])
124 | 
125 |         else:
126 |             result = tokenizer_obj().tokenize(self.options['sentence'])
127 | 
128 |         return self._dump(result)
129 | 
130 |     def _dump(self, result):
131 |         response = {
132 |             'status': True,
133 |             'result': result
134 |         }
135 |         return response
136 | 
137 | 
138 | class NLTKTag(object):
139 |     """
140 |     NLTK POS tagger used: pos_tag, UnigramTagger, BigramTagger & RegexpTagger
141 | 
142 |     Accepts:
143 | 
144 |     /api/tag?sentence=<sentence>/
145 |     /api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram/default>/
146 |     /api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram/default>&train=<categories>/
147 |     including any query parameter accepted by /api/tag/
148 | 
149 |     Query Parameters:
150 | 
151 |         * Mandatory:
152 |             1. sentence:
153 |                 type: string
154 | 
155 |         * Optional:
156 |             1. tagger:
157 |                 value: pos/unigram/bigram/regex
158 |                 default: pos_tag
159 |             2. train (iff unigram/bigram):
160 |                 value: 'news', 'editorial', 'reviews', 'religion',
161 |                        'learned', 'science_fiction', 'romance', 'humor'
162 |                 default: 'news'
163 |             3. any query parameter acceptable by /api/tag/
164 |     """
165 | 
166 |     def __init__(self, options):
167 |         self.options = options
168 | 
169 |     def pos_tag(self):
170 |         tokenize_obj = NLTKTokenize(self.options)
171 |         res = tokenize_obj.tokenize()
172 |         tokens = res['result']
173 |         tags = []
174 | 
175 |         # Performs Bigram / Unigram / Regex Tagging
176 |         if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
177 |             trainer = self.options['train'] if self.options.get(
178 |                 'train') in TRAINERS else DEFAULT_TRAIN
179 | 
180 |             train = brown.tagged_sents(categories=trainer)
181 | 
182 |             # Create your custom regex tagging pattern here
183 |             regex_tag = RegexpTagger([
184 |                 (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
185 |                 (r'.*able$', 'JJ'),
186 |                 (r'^[A-Z].*$', 'NNP'),
187 |                 (r'.*ly$', 'RB'),
188 |                 (r'.*s$', 'NNS'),
189 |                 (r'.*ing$', 'VBG'),
190 |                 (r'.*ed$', 'VBD'),
191 |                 (r'.*', 'NN')
192 |             ])
193 | 
194 |             current = os.path.dirname(os.path.abspath(__file__))
195 | 
196 |             # Unigram tag training data load / dump pickle
197 |             pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
198 |             if os.path.isfile(pkl_name):
199 |                 with open(pkl_name, 'rb') as pkl:
200 |                     unigram_tag = load(pkl)
201 |             else:
202 |                 unigram_tag = UnigramTagger(train, backoff=regex_tag)
203 |                 with open(pkl_name, 'wb') as pkl:
204 |                     dump(unigram_tag, pkl, -1)
205 | 
206 |             # Bigram tag training data load / dump pickle
207 |             if self.options['tagger'] == 'bigram':
208 |                 pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
209 |                 if os.path.isfile(pkl_name):
210 |                     with open(pkl_name, 'rb') as pkl:
211 |                         bigram_tag = load(pkl)
212 |                 else:
213 |                     bigram_tag = BigramTagger(train, backoff=unigram_tag)
214 |                     with open(pkl_name, 'wb') as pkl:
215 |                         dump(bigram_tag, pkl, -1)
216 |                 tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
217 |             elif self.options['tagger'] == 'unigram':
218 |                 tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
219 |             else:
220 |                 tags = regex_tag.tag(tokens)  # Regex tagging performed here
221 | 
222 |         # Performs default pos_tag
223 |         elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
224 |             tags = pos_tag(tokens)
225 | 
226 |         return self._dump(tags)
227 | 
228 |     @staticmethod
229 |     def _dump(result):
230 |         response = {
231 |             'status': True,
232 |             'result': result
233 |         }
234 |         return response
235 | 
236 | 
237 | class NLTKner(object):
238 |     """
239 |     NLTK NER used: ne_chunk
240 | 
241 |     Accepts:
242 | 
243 |     /api/ner?sentence=<sentence>/
244 |     including any query parameter accepted by /api/tag/
245 | 
246 |     Query Parameters:
247 | 
248 |         * Mandatory:
249 |             1. sentence:
250 |                 type: string
251 | 
252 |         * Optional:
253 |             1. any query parameter acceptable by /api/tag/
254 |     """
255 | 
256 |     def __init__(self, options):
257 |         self.options = options
258 | 
259 |     def ner(self):
260 |         pos_obj = NLTKTag(self.options)
261 |         res = pos_obj.pos_tag()
262 | 
263 |         tagged_sentence = res['result']
264 |         chunked_sentence = ne_chunk(tagged_sentence)
265 |         tokens = self._parse(chunked_sentence)
266 |         return self._dump(tokens)
267 | 
268 |     def _parse(self, tree):
269 |         n_tokens = []
270 |         for node in tree:
271 |             if isinstance(node, nltk.tree.Tree):
272 |                 if node.label() in ['NE', 'PERSON']:
273 |                     for leaf in node.leaves():
274 |                         n_tokens.append(leaf[0])
275 |         return n_tokens
276 | 
277 |     def _dump(self, tokens):
278 |         response = {
279 |                 'status': True,
280 |                 'result': tokens
281 |                 }
282 |         return response
283 | 
284 | 
285 | class NLTKLemmatize(object):
286 |     """
287 |     NLTK Lemmatizer used: WordNetLemmatizer
288 | 
289 |     Accepts:
290 |     /api/lemma?words=<words>/
291 | 
292 |     Query Parameters:
293 | 
294 |         * Mandatory:
295 |             1. words:
296 |                 type: string comma separated
297 |     """
298 | 
299 |     def __init__(self, options):
300 |         self.options = options
301 | 
302 |     def lemma(self):
303 |         lemma_obj = WordNetLemmatizer()
304 |         words = self._clean(self.options['words'])
305 |         result = [lemma_obj.lemmatize(word) for word in words]
306 |         return self._dump(result)
307 | 
308 |     def _clean(self, words):
309 |         return words.split(',')
310 | 
311 |     def _dump(self, result):
312 |         response = {
313 |                 'status': True,
314 |                 'result': result
315 |                 }
316 |         return response
317 | 
318 | 
319 | class NLTKSentiment(object):
320 |     """
321 |     NLTK Sentiment Analyzer used: vader
322 | 
323 |     Accepts:
324 |     /api/sentiment?sentence=<sentence>/
325 | 
326 |     Query Parameters:
327 | 
328 |         * Mandatory:
329 |             1. sentence:
330 |                 type: string
331 |     """
332 | 
333 |     def __init__(self, options):
334 |         self.options = options
335 | 
336 |     def sentiment(self):
337 |         sentiment_obj = SentimentIntensityAnalyzer()
338 |         result = sentiment_obj.polarity_scores(self.options['sentence'])
339 |         return self._dump(result)
340 | 
341 |     def _dump(self, result):
342 |         response = {
343 |                 'status': True,
344 |                 'result': result
345 |                 }
346 |         return response
347 | 
348 | 


--------------------------------------------------------------------------------
/app/views.py:
--------------------------------------------------------------------------------
  1 | from rest_framework.views import APIView
  2 | from rest_framework.response import Response
  3 | 
  4 | from util import NLTKStem, NLTKTokenize, NLTKTag, NLTKner, NLTKLemmatize, \
  5 |         NLTKSentiment
  6 | 
  7 | 
  8 | class HomeView(APIView):
  9 |     """
 10 |     Landing Page View
 11 |     """
 12 | 
 13 |     def get(self, request):
 14 |         res = {
 15 |             'examples': [
 16 |                 '/api/stem?words=<word1,word2,word3>',
 17 |                 '/api/stem?words=<word1,word2>&stemmer=<porter/snowball/lancaster/default>',
 18 |                 '/api/stem?words=<word1,word2>&stemmer=snowball&language<language>&ignore_stopwords=<true/false>',
 19 |                 '/api/tokenize?sentence=<sentence>',
 20 |                 '/api/tokenize?sentence=<sentence>&tokenizer=<word/tweet/default>',
 21 |                 '/api/tag?sentence=<sentence>',
 22 |                 '/api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram>',
 23 |                 '/api/tag?sentence=<sentence>&tagger=<pos/unigram/bigram>&train=<categories>',
 24 |                 '/api/tag?sentence=<sentence>&tokenizer=<word/tweet/default>',
 25 |                 '/api/ner?sentence=<sentence>',
 26 |                 '/api/ner?sentence=<sentence>&tokenizer=<word/tweet/default>',
 27 |                 '/api/sentiment?sentence=<sentence>'
 28 |             ],
 29 |             'message': 'see app/util.py for details',
 30 |             'repository': 'https://github.com/vipul-sharma20/nltk-api-server',
 31 |         }
 32 |         return Response(res)
 33 | 
 34 | 
 35 | class StemView(APIView):
 36 |     """
 37 |     View for Stemming words
 38 |     """
 39 | 
 40 |     def get(self, request):
 41 |         data = request.GET
 42 |         stem_obj = NLTKStem(data)
 43 |         res = stem_obj.stem()
 44 | 
 45 |         return Response(res)
 46 | 
 47 | 
 48 | class TokenizeView(APIView):
 49 |     """
 50 |     View for Tokenizing strings
 51 |     """
 52 | 
 53 |     def get(self, request):
 54 |         data = request.GET
 55 |         tokenize_obj = NLTKTokenize(data)
 56 |         res = tokenize_obj.tokenize()
 57 | 
 58 |         return Response(res)
 59 | 
 60 | 
 61 | class POSTagView(APIView):
 62 |     """
 63 |     View for Part Of Speech tagging
 64 |     """
 65 | 
 66 |     def get(self, request):
 67 |         data = request.GET
 68 |         pos_obj = NLTKTag(data)
 69 |         res = pos_obj.pos_tag()
 70 | 
 71 |         return Response(res)
 72 | 
 73 | 
 74 | class NERView(APIView):
 75 |     """
 76 |     View for Named Entity Recognition
 77 |     """
 78 | 
 79 |     def get(self, request):
 80 |         data = request.GET
 81 |         ner_obj = NLTKner(data)
 82 |         res = ner_obj.ner()
 83 | 
 84 |         return Response(res)
 85 | 
 86 | 
 87 | class LemmatizeView(APIView):
 88 |     """
 89 |     View for Lemmatization
 90 |     """
 91 | 
 92 |     def get(self, request):
 93 |         data = request.GET
 94 |         lemma_obj = NLTKLemmatize(data)
 95 |         res = lemma_obj.lemma()
 96 | 
 97 |         return Response(res)
 98 | 
 99 | 
100 | class SentimentView(APIView):
101 |     """
102 |     View for Sentiment Analysis
103 |     """
104 | 
105 |     def get(self, request):
106 |         data = request.GET
107 |         senti_obj = NLTKSentiment(data)
108 |         res = senti_obj.sentiment()
109 | 
110 |         return Response(res)
111 | 
112 | 


--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | import sys
 4 | 
 5 | if __name__ == "__main__":
 6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nltk_api.settings")
 7 | 
 8 |     from django.core.management import execute_from_command_line
 9 | 
10 |     execute_from_command_line(sys.argv)
11 | 


--------------------------------------------------------------------------------
/nltk_api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/nltk_api/__init__.py


--------------------------------------------------------------------------------
/nltk_api/settings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for nltk_api project.
  3 | """
  4 | 
  5 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
  6 | import os
  7 | 
  8 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  9 | 
 10 | # SECURITY WARNING: keep the secret key used in production secret!
 11 | SECRET_KEY = '3!4c6)^ggxuf!gbk$q2*ea52*oe+%)d%+!(td9mm5^6o_)bir2'
 12 | 
 13 | # SECURITY WARNING: don't run with debug turned on in production!
 14 | DEBUG = True
 15 | 
 16 | ALLOWED_HOSTS = []
 17 | 
 18 | 
 19 | # Application definition
 20 | 
 21 | INSTALLED_APPS = (
 22 |     'django.contrib.admin',
 23 |     'django.contrib.auth',
 24 |     'django.contrib.contenttypes',
 25 |     'django.contrib.sessions',
 26 |     'django.contrib.messages',
 27 |     'django.contrib.staticfiles',
 28 |     'rest_framework',
 29 | )
 30 | 
 31 | MIDDLEWARE_CLASSES = (
 32 |     'django.contrib.sessions.middleware.SessionMiddleware',
 33 |     'django.middleware.common.CommonMiddleware',
 34 |     'django.middleware.csrf.CsrfViewMiddleware',
 35 |     'django.contrib.auth.middleware.AuthenticationMiddleware',
 36 |     'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
 37 |     'django.contrib.messages.middleware.MessageMiddleware',
 38 |     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 39 |     'django.middleware.security.SecurityMiddleware',
 40 |     'app.middleware.validate_middleware.ValidateParameterMiddleware',
 41 | )
 42 | 
 43 | ROOT_URLCONF = 'nltk_api.urls'
 44 | 
 45 | TEMPLATES = [
 46 |     {
 47 |         'BACKEND': 'django.template.backends.django.DjangoTemplates',
 48 |         'DIRS': [os.path.join(BASE_DIR, 'app/templates/rest_framework')],
 49 |         'APP_DIRS': True,
 50 |         'OPTIONS': {
 51 |             'context_processors': [
 52 |                 'django.template.context_processors.debug',
 53 |                 'django.template.context_processors.request',
 54 |                 'django.contrib.auth.context_processors.auth',
 55 |                 'django.contrib.messages.context_processors.messages',
 56 |             ],
 57 |         },
 58 |     },
 59 | ]
 60 | 
 61 | WSGI_APPLICATION = 'nltk_api.wsgi.application'
 62 | 
 63 | 
 64 | # Database
 65 | 
 66 | DATABASES = {
 67 |     'default': {
 68 |         'ENGINE': 'django.db.backends.sqlite3',
 69 |         'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
 70 |     }
 71 | }
 72 | 
 73 | # REST frame
 74 | #REST_FRAMEWORK = {
 75 |     #'DEFAULT_PERMISSION_CLASSES': [
 76 |     #    'rest_framework.permissions.DjangoModelPermissionsOrAnonReadOnly'
 77 |     #]
 78 | #}
 79 | 
 80 | REST_FRAMEWORK = {
 81 |     'DEFAULT_RENDERER_CLASSES': (
 82 |         'rest_framework.renderers.JSONRenderer',
 83 |         'rest_framework.renderers.BrowsableAPIRenderer',
 84 |     ),
 85 | 	'DEFAULT_THROTTLE_CLASSES': (
 86 |         'rest_framework.throttling.AnonRateThrottle',
 87 |     ),
 88 |     'DEFAULT_THROTTLE_RATES': {
 89 |         'anon': '100/day',
 90 |     }
 91 | }
 92 | 
 93 | # Internationalization
 94 | 
 95 | LANGUAGE_CODE = 'en-us'
 96 | 
 97 | TIME_ZONE = 'UTC'
 98 | 
 99 | USE_I18N = True
100 | 
101 | USE_L10N = True
102 | 
103 | USE_TZ = True
104 | 
105 | 
106 | # Static files (CSS, JavaScript, Images)
107 | 
108 | STATIC_URL = '/static/'
109 | 


--------------------------------------------------------------------------------
/nltk_api/urls.py:
--------------------------------------------------------------------------------
 1 | """
 2 | nltk_api URL Configuration
 3 | """
 4 | import app
 5 | from app import urls
 6 | 
 7 | from django.conf.urls import url, include
 8 | from django.contrib import admin
 9 | 
10 | from rest_framework import routers
11 | 
12 | router = routers.DefaultRouter()
13 | 
14 | urlpatterns = [
15 |     url(r'^', include(router.urls)),
16 |     url(r'^', include(app.urls)),
17 |     url(r'^admin/', include(admin.site.urls)),
18 | ]
19 | 
20 | 


--------------------------------------------------------------------------------
/nltk_api/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for nltk_api project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.wsgi import get_wsgi_application
13 | 
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nltk_api.settings")
15 | 
16 | application = get_wsgi_application()
17 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Django==1.8
2 | djangorestframework==3.5.3
3 | httpie==0.9.6
4 | nltk==3.2.1
5 | Pygments==2.1.3
6 | requests==2.12.1
7 | twython==3.4.0
8 | numpy==1.11.2
9 | 


--------------------------------------------------------------------------------