├── .gitignore ├── LICENSE.rst ├── README.md ├── app ├── __init__.py ├── admin.py ├── constants.py ├── middleware │ ├── __init__.py │ └── validate_middleware.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py ├── trained │ ├── bigram_editorial.pkl │ ├── bigram_humor.pkl │ ├── bigram_learned.pkl │ ├── bigram_news.pkl │ ├── bigram_religion.pkl │ ├── bigram_reviews.pkl │ ├── bigram_romance.pkl │ ├── bigram_science_fiction.pkl │ ├── unigram_editorial.pkl │ ├── unigram_humor.pkl │ ├── unigram_learned.pkl │ ├── unigram_news.pkl │ ├── unigram_religion.pkl │ ├── unigram_reviews.pkl │ ├── unigram_romance.pkl │ └── unigram_science_fiction.pkl ├── urls.py ├── util.py └── views.py ├── manage.py ├── nltk_api ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | nltk-api-server/ 2 | *.pyc 3 | *.swp 4 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Vipul Sharma 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | nltk-api-server 2 | =============== 3 | 4 | API server for NLTK. Aimed to provide convenient interface to use NLTK over 5 | any programming language 6 | 7 | Features 8 | ======== 9 | * Stemming 10 | * Lemmatization 11 | * Part of Speech Tagging (Unigram/Bigram/Regex) 12 | * Named Entity Recognition 13 | * Sentiment Analysis 14 | 15 | --------------- 16 | **1. Stemming** 17 | --------------- 18 | 19 | NLTK Stemmers used: Porter, Snowball and Lancaster 20 | 21 | * **Accepts:** 22 | 23 | * `/api/stem?words=/` 24 | 25 | * `/api/stem?words=&stemmer=/` 26 | 27 | * `/api/stem?words=&stemmer=snowball&language=&ignore_stopwords=/` 28 | 29 | * **Query Parameters:** 30 | 31 | * Mandatory: 32 | * `words`: 33 | 34 | ***value:*** string comma separated 35 | * Optional: 36 | * `stemmer`: 37 | 38 | ***value:*** porter/snowball/lancaster/default 39 | 40 | ***default:*** snowball 41 | 42 | * `ignore_stopwords`: Only for Snowball Stemmer 43 | 44 | ***value:*** true/false 45 | 46 | ***default:*** false 47 | 48 | * `language`: Only for Snowball Stemmer 49 | 50 | ***value:*** see SnowballStemmer.languages 51 | 52 | ***default:*** english 53 | 54 | ------------ 55 | 1.1 Examples 56 | ------------ 57 | 58 | * `localhost:9000/api/stem?words=dangerous,monitoring,testing` 59 | 60 | { 61 | "status": true, 62 | "result": [ 63 | "danger", 64 | "monitor", 65 | "test" 66 | ] 67 | } 68 | 69 | * `localhost:9000/api/stem/?words=dangerous,monitoring,testing&stemmer=snowball` 70 | 71 | { 72 | "status": true, 73 | "result": [ 74 | "dog", 75 | "cat", 76 | "vertex" 77 | ] 78 | } 79 | 80 | * The above examples do not cover all cases. See the section above examples 81 | for more features 82 | 83 | -------------------- 84 | **2. Lemmatization** 85 | -------------------- 86 | 87 | NLTK Lemmatizer used: WordNetLemmatizer 88 | 89 | * **Accepts:** 90 | 91 | * `/api/lemma?words=/` 92 | 93 | * **Query Parameters:** 94 | * Mandatory: 95 | * `words`: 96 | 97 | ***value:*** string comma separated 98 | 99 | ------------ 100 | 2.1 Examples 101 | ------------ 102 | * `localhost/api/lemma/?words=dogs,cats,vertices` 103 | 104 | { 105 | "status":true, 106 | "result": [ 107 | "dog", 108 | "cat", 109 | "vertex" 110 | ] 111 | } 112 | 113 | ----------------------------- 114 | **3. Part of Speech Tagging** 115 | ----------------------------- 116 | 117 | NLTK POS tagger used: pos_tag, UnigramTagger, BigramTagger & RegexpTagger 118 | 119 | * **Accepts:** 120 | 121 | * `/api/tag?sentence=/` 122 | 123 | * `/api/tag?sentence=&tagger=/` 124 | 125 | * `/api/tag?sentence=&tagger=&train=/` 126 | 127 | including any query parameter accepted by /api/tag/ 128 | 129 | * **Query Parameters:** 130 | 131 | * Mandatory: 132 | * `sentence`: 133 | 134 | ***value:*** string 135 | 136 | * Optional: 137 | 138 | * `tagger`: 139 | 140 | ***value:*** pos/unigram/bigram/regex 141 | 142 | ***default:*** pos_tag 143 | 144 | * `train` (iff unigram/bigram): 145 | 146 | ***value:*** 'news', 'editorial', 'reviews', 'religion', 147 | 'learned', 'science_fiction', 'romance', 'humor' 148 | 149 | ***default:*** 'news' 150 | 151 | * **any query parameter acceptable by /api/tag/** 152 | 153 | ------------ 154 | 3.1 Examples 155 | ------------ 156 | * `localhost/api/sentence=this is a test` 157 | 158 | { 159 | "status": true, 160 | "result": [ 161 | [ 162 | "this", 163 | "DT" 164 | ], 165 | [ 166 | "is", 167 | "VBZ" 168 | ], 169 | [ 170 | "a", 171 | "DT" 172 | ], 173 | [ 174 | "test", 175 | "NN" 176 | ] 177 | ] 178 | } 179 | 180 | * `localhost/api/sentence=this is a test&tagger=unigram` 181 | 182 | { 183 | "status": true, 184 | "result": [ 185 | [ 186 | "this", 187 | "DT" 188 | ], 189 | [ 190 | "is", 191 | "BEZ" 192 | ], 193 | [ 194 | "a", 195 | "AT" 196 | ], 197 | [ 198 | "test", 199 | "NN" 200 | ] 201 | ] 202 | } 203 | 204 | * The above examples do not cover all cases. See the section above examples 205 | for more features 206 | * **Remember, we can also use trained data along with the unigram/bigram tagger: 207 | 'news', 'editorial', 'reviews', 'religion', 'learned', 'science_fiction', 'romance', 'humor'** 208 | 209 | ------------------------------- 210 | **4. Named Entity Recognition** 211 | ------------------------------- 212 | 213 | NLTK NER used: ne_chunk 214 | 215 | * Accepts: 216 | 217 | * `/api/ner?sentence=/` 218 | 219 | including any query parameter accepted by /api/tag/ 220 | 221 | * Query Parameters: 222 | 223 | * Mandatory: 224 | * `sentence`: 225 | 226 | ***value:*** string 227 | 228 | * Optional: 229 | 230 | * **any query parameter acceptable by /api/tag/** 231 | 232 | ------------ 233 | 4.1 Examples 234 | ------------ 235 | 236 | * `localhost/api/ner?sentence=At the Olympics in August, Phelps picked up five gold medal` 237 | 238 | { 239 | "status": true, 240 | "result": [ 241 | "Phelps" 242 | ] 243 | } 244 | 245 | ------------------------- 246 | **5. Sentiment Analysis** 247 | ------------------------- 248 | 249 | NLTK Sentiment Analyzer used: vader 250 | 251 | * **Accepts:** 252 | 253 | * `/api/sentiment?sentence=/` 254 | 255 | * **Query Parameters:** 256 | 257 | * Mandatory: 258 | * `sentence`: 259 | 260 | ***value:*** string 261 | 262 | * Optional: 263 | 264 | * **any query parameter acceptable by /api/tag/** 265 | 266 | ------------ 267 | 5.1 Examples 268 | ------------ 269 | 270 | * `localhost/api/sentiment?sentence=At the Olympics in August, Phelps picked up five gold medal` 271 | 272 | { 273 | "status": true, 274 | "result": { 275 | "neg": 0, 276 | "neu": 0.256, 277 | "pos": 0.744, 278 | "compound": 0.4404 279 | } 280 | } 281 | 282 | ------------------- 283 | **6. Run on local** 284 | ------------------- 285 | 286 | * `git clone git@github.com:vipul-sharma20/nltk-api-server.git` 287 | * `cd nltk-api-server` 288 | * `sudo pip install virtualenv` 289 | * Python 2.7.9 and later (on the python2 series), and Python 3.4 and later include pip by default, so you may have pip already. 290 | * If you don't have pip installed, visit here to see steps to install virtualenv: [https://virtualenv.readthedocs.org/en/latest/installation.html](https://virtualenv.readthedocs.org/en/latest/installation.html) 291 | * `virtualenv nltk-api` 292 | * `source nltk-api/bin/activate` 293 | * `pip install -r requirements.txt` (wait till the requirements are installed) 294 | * `python manage.py runserver` This will run the application on [http://127.0.0.1:8000/](http://127.0.0.1:8000/) 295 | 296 | **IMPORTANT:** You will require some corpora and trained models 297 | for the code to run. You can refer to: [http://www.nltk.org/data.html](http://www.nltk.org/data.html) 298 | 299 | * Interactive Method: 300 | 301 | In [1]: import nltk 302 | 303 | In [2]: nltk.download() 304 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/__init__.py -------------------------------------------------------------------------------- /app/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /app/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_STEMMER = 'snowball' 2 | 3 | DEFAULT_TOKENIZER = 'word' 4 | 5 | DEFAULT_TAGGER = 'pos' 6 | TRAINERS = ['news', 'editorial', 'reviews', 'religion', 7 | 'learned', 'science_fiction', 'romance', 'humor'] 8 | DEFAULT_TRAIN = 'news' -------------------------------------------------------------------------------- /app/middleware/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/middleware/__init__.py -------------------------------------------------------------------------------- /app/middleware/validate_middleware.py: -------------------------------------------------------------------------------- 1 | from django.http import JsonResponse 2 | 3 | 4 | class ValidateParameterMiddleware(object): 5 | 6 | params = { 7 | 'words': ['stem', 'lemma'], 8 | 'sentence': ['tokenize', 'ner', 'tag', 'sentiment'] 9 | } 10 | 11 | def process_request(self, request): 12 | if request.path != '/': 13 | segments = request.path.split('/') 14 | if len(segments) > 3: 15 | view_type = segments[2] 16 | message = '' 17 | if view_type in self.params['sentence'] and not \ 18 | request.GET.get('sentence'): 19 | message = 'sentence' 20 | elif view_type in self.params['words'] and not \ 21 | request.GET.get('words'): 22 | message = 'words' 23 | if message: 24 | return JsonResponse({ 25 | 'message': '%s parameter missing' % message, 26 | 'status': False 27 | }) 28 | 29 | -------------------------------------------------------------------------------- /app/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/migrations/__init__.py -------------------------------------------------------------------------------- /app/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /app/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /app/trained/bigram_editorial.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_editorial.pkl -------------------------------------------------------------------------------- /app/trained/bigram_humor.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_humor.pkl -------------------------------------------------------------------------------- /app/trained/bigram_learned.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_learned.pkl -------------------------------------------------------------------------------- /app/trained/bigram_news.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_news.pkl -------------------------------------------------------------------------------- /app/trained/bigram_religion.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_religion.pkl -------------------------------------------------------------------------------- /app/trained/bigram_reviews.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_reviews.pkl -------------------------------------------------------------------------------- /app/trained/bigram_romance.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_romance.pkl -------------------------------------------------------------------------------- /app/trained/bigram_science_fiction.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/bigram_science_fiction.pkl -------------------------------------------------------------------------------- /app/trained/unigram_editorial.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_editorial.pkl -------------------------------------------------------------------------------- /app/trained/unigram_humor.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_humor.pkl -------------------------------------------------------------------------------- /app/trained/unigram_learned.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_learned.pkl -------------------------------------------------------------------------------- /app/trained/unigram_news.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_news.pkl -------------------------------------------------------------------------------- /app/trained/unigram_religion.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_religion.pkl -------------------------------------------------------------------------------- /app/trained/unigram_reviews.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_reviews.pkl -------------------------------------------------------------------------------- /app/trained/unigram_romance.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_romance.pkl -------------------------------------------------------------------------------- /app/trained/unigram_science_fiction.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/app/trained/unigram_science_fiction.pkl -------------------------------------------------------------------------------- /app/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import url 2 | from app.views import StemView, TokenizeView, POSTagView, NERView, \ 3 | LemmatizeView, HomeView, SentimentView 4 | from rest_framework.urlpatterns import format_suffix_patterns 5 | 6 | urlpatterns = [ 7 | url(r'^api/$', HomeView.as_view()), 8 | url(r'^api/stem/$', StemView.as_view()), 9 | url(r'^api/tokenize/$', TokenizeView.as_view()), 10 | url(r'^api/lemma/$', LemmatizeView.as_view()), 11 | url(r'^api/tag/$', POSTagView.as_view()), 12 | url(r'^api/ner/$', NERView.as_view()), 13 | url(r'^api/sentiment/$', SentimentView.as_view()), 14 | ] 15 | 16 | urlpatterns += format_suffix_patterns(urlpatterns) 17 | -------------------------------------------------------------------------------- /app/util.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from pickle import load, dump 3 | 4 | import nltk 5 | from nltk.stem.porter import PorterStemmer 6 | from nltk.stem import SnowballStemmer, WordNetLemmatizer 7 | from nltk.stem.lancaster import LancasterStemmer 8 | from nltk import word_tokenize, pos_tag, UnigramTagger, BigramTagger, \ 9 | RegexpTagger, ne_chunk, sent_tokenize 10 | from nltk.tokenize import TweetTokenizer 11 | from nltk.corpus import brown 12 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 13 | 14 | from app.constants import DEFAULT_STEMMER, DEFAULT_TOKENIZER, DEFAULT_TAGGER, \ 15 | DEFAULT_TRAIN, TRAINERS 16 | 17 | 18 | 19 | class NLTKStem(object): 20 | """ 21 | NLTK Stemmers used: Porter, Snowball and Lancaster 22 | 23 | Accepts: 24 | 25 | /api/stem?words=/ 26 | /api/stem?words=&stemmer=/ 27 | /api/stem?words=&stemmer=snowball&language=&ignore_stopwords=/ 28 | 29 | Query Parameters: 30 | 31 | * Mandatory: 32 | 1. words: 33 | type: string comma separated 34 | * Optional: 35 | 1. stemmer: 36 | value: porter/snowball/lancaster/default 37 | default: snowball 38 | 2. ignore_stopwords: Only for Snowball Stemmer 39 | value: true/false 40 | default: false 41 | 3. language: Only for Snowball Stemmer 42 | value: see SnowballStemmer.languages 43 | default: english 44 | """ 45 | 46 | dispatch = { 47 | 'porter': PorterStemmer, 48 | 'snowball': SnowballStemmer, 49 | 'lancaster': LancasterStemmer, 50 | } 51 | dispatch['default'] = dispatch[DEFAULT_STEMMER] 52 | 53 | def __init__(self, options): 54 | self.options = options 55 | 56 | def stem(self): 57 | words = self._clean(self.options['words']) 58 | stemmer = self.options.get('stemmer', DEFAULT_STEMMER) 59 | stemmer_obj = self.dispatch.get(stemmer, self.dispatch[DEFAULT_STEMMER]) 60 | result = [] 61 | 62 | if stemmer_obj == SnowballStemmer: 63 | ignore_stopwords = False 64 | 65 | if self.options.get('ignore_stopwords'): 66 | if self.options['ignore_stopwords'] == 'true': 67 | ignore_stopwords = True 68 | language = self.options.get('language', 'english') 69 | 70 | result = [stemmer_obj(language, ignore_stopwords).stem(word) 71 | for word in words] 72 | else: 73 | result = [stemmer_obj().stem(word) for word in words] 74 | 75 | return self._dump(result) 76 | 77 | def _clean(self, words): 78 | return words.split(',') 79 | 80 | def _dump(self, result): 81 | response = { 82 | 'status': True, 83 | 'result': result 84 | } 85 | return response 86 | 87 | 88 | class NLTKTokenize(object): 89 | """ 90 | NLTK Tokenizers used: word_tokenize, StringTokenizer, TweetTokenizer 91 | 92 | Accepts: 93 | 94 | /api/tokenize?sentence=/ 95 | /api/tokenize?sentence=&tokenizer=/ 96 | 97 | Query Parameters: 98 | 99 | * Mandatory: 100 | 1. sentence: 101 | type: string 102 | * Optional: 103 | 1. tokenizer: 104 | value: word/tweet/default 105 | default: word_tokenize 106 | """ 107 | 108 | dispatch = { 109 | 'word': word_tokenize, 110 | 'tweet': TweetTokenizer, 111 | } 112 | dispatch['default'] = dispatch[DEFAULT_TOKENIZER] 113 | 114 | def __init__(self, options): 115 | self.options = options 116 | 117 | def tokenize(self): 118 | tokenizer = self.options.get('tokenizer', DEFAULT_TOKENIZER) 119 | tokenizer_obj = self.dispatch.get(tokenizer, 120 | self.dispatch[DEFAULT_TOKENIZER]) 121 | 122 | if tokenizer_obj == word_tokenize: 123 | result = tokenizer_obj(self.options['sentence']) 124 | 125 | else: 126 | result = tokenizer_obj().tokenize(self.options['sentence']) 127 | 128 | return self._dump(result) 129 | 130 | def _dump(self, result): 131 | response = { 132 | 'status': True, 133 | 'result': result 134 | } 135 | return response 136 | 137 | 138 | class NLTKTag(object): 139 | """ 140 | NLTK POS tagger used: pos_tag, UnigramTagger, BigramTagger & RegexpTagger 141 | 142 | Accepts: 143 | 144 | /api/tag?sentence=/ 145 | /api/tag?sentence=&tagger=/ 146 | /api/tag?sentence=&tagger=&train=/ 147 | including any query parameter accepted by /api/tag/ 148 | 149 | Query Parameters: 150 | 151 | * Mandatory: 152 | 1. sentence: 153 | type: string 154 | 155 | * Optional: 156 | 1. tagger: 157 | value: pos/unigram/bigram/regex 158 | default: pos_tag 159 | 2. train (iff unigram/bigram): 160 | value: 'news', 'editorial', 'reviews', 'religion', 161 | 'learned', 'science_fiction', 'romance', 'humor' 162 | default: 'news' 163 | 3. any query parameter acceptable by /api/tag/ 164 | """ 165 | 166 | def __init__(self, options): 167 | self.options = options 168 | 169 | def pos_tag(self): 170 | tokenize_obj = NLTKTokenize(self.options) 171 | res = tokenize_obj.tokenize() 172 | tokens = res['result'] 173 | tags = [] 174 | 175 | # Performs Bigram / Unigram / Regex Tagging 176 | if self.options.get('tagger') in ['unigram', 'bigram', 'regex']: 177 | trainer = self.options['train'] if self.options.get( 178 | 'train') in TRAINERS else DEFAULT_TRAIN 179 | 180 | train = brown.tagged_sents(categories=trainer) 181 | 182 | # Create your custom regex tagging pattern here 183 | regex_tag = RegexpTagger([ 184 | (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'), 185 | (r'.*able$', 'JJ'), 186 | (r'^[A-Z].*$', 'NNP'), 187 | (r'.*ly$', 'RB'), 188 | (r'.*s$', 'NNS'), 189 | (r'.*ing$', 'VBG'), 190 | (r'.*ed$', 'VBD'), 191 | (r'.*', 'NN') 192 | ]) 193 | 194 | current = os.path.dirname(os.path.abspath(__file__)) 195 | 196 | # Unigram tag training data load / dump pickle 197 | pkl_name = current + '/trained/unigram_' + trainer + '.pkl' 198 | if os.path.isfile(pkl_name): 199 | with open(pkl_name, 'rb') as pkl: 200 | unigram_tag = load(pkl) 201 | else: 202 | unigram_tag = UnigramTagger(train, backoff=regex_tag) 203 | with open(pkl_name, 'wb') as pkl: 204 | dump(unigram_tag, pkl, -1) 205 | 206 | # Bigram tag training data load / dump pickle 207 | if self.options['tagger'] == 'bigram': 208 | pkl_name = current + '/trained/bigram_' + trainer + '.pkl' 209 | if os.path.isfile(pkl_name): 210 | with open(pkl_name, 'rb') as pkl: 211 | bigram_tag = load(pkl) 212 | else: 213 | bigram_tag = BigramTagger(train, backoff=unigram_tag) 214 | with open(pkl_name, 'wb') as pkl: 215 | dump(bigram_tag, pkl, -1) 216 | tags = bigram_tag.tag(tokens) # Bigram tagging performed here 217 | elif self.options['tagger'] == 'unigram': 218 | tags = unigram_tag.tag(tokens) # Unigram tagging performed here 219 | else: 220 | tags = regex_tag.tag(tokens) # Regex tagging performed here 221 | 222 | # Performs default pos_tag 223 | elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos': 224 | tags = pos_tag(tokens) 225 | 226 | return self._dump(tags) 227 | 228 | @staticmethod 229 | def _dump(result): 230 | response = { 231 | 'status': True, 232 | 'result': result 233 | } 234 | return response 235 | 236 | 237 | class NLTKner(object): 238 | """ 239 | NLTK NER used: ne_chunk 240 | 241 | Accepts: 242 | 243 | /api/ner?sentence=/ 244 | including any query parameter accepted by /api/tag/ 245 | 246 | Query Parameters: 247 | 248 | * Mandatory: 249 | 1. sentence: 250 | type: string 251 | 252 | * Optional: 253 | 1. any query parameter acceptable by /api/tag/ 254 | """ 255 | 256 | def __init__(self, options): 257 | self.options = options 258 | 259 | def ner(self): 260 | pos_obj = NLTKTag(self.options) 261 | res = pos_obj.pos_tag() 262 | 263 | tagged_sentence = res['result'] 264 | chunked_sentence = ne_chunk(tagged_sentence) 265 | tokens = self._parse(chunked_sentence) 266 | return self._dump(tokens) 267 | 268 | def _parse(self, tree): 269 | n_tokens = [] 270 | for node in tree: 271 | if isinstance(node, nltk.tree.Tree): 272 | if node.label() in ['NE', 'PERSON']: 273 | for leaf in node.leaves(): 274 | n_tokens.append(leaf[0]) 275 | return n_tokens 276 | 277 | def _dump(self, tokens): 278 | response = { 279 | 'status': True, 280 | 'result': tokens 281 | } 282 | return response 283 | 284 | 285 | class NLTKLemmatize(object): 286 | """ 287 | NLTK Lemmatizer used: WordNetLemmatizer 288 | 289 | Accepts: 290 | /api/lemma?words=/ 291 | 292 | Query Parameters: 293 | 294 | * Mandatory: 295 | 1. words: 296 | type: string comma separated 297 | """ 298 | 299 | def __init__(self, options): 300 | self.options = options 301 | 302 | def lemma(self): 303 | lemma_obj = WordNetLemmatizer() 304 | words = self._clean(self.options['words']) 305 | result = [lemma_obj.lemmatize(word) for word in words] 306 | return self._dump(result) 307 | 308 | def _clean(self, words): 309 | return words.split(',') 310 | 311 | def _dump(self, result): 312 | response = { 313 | 'status': True, 314 | 'result': result 315 | } 316 | return response 317 | 318 | 319 | class NLTKSentiment(object): 320 | """ 321 | NLTK Sentiment Analyzer used: vader 322 | 323 | Accepts: 324 | /api/sentiment?sentence=/ 325 | 326 | Query Parameters: 327 | 328 | * Mandatory: 329 | 1. sentence: 330 | type: string 331 | """ 332 | 333 | def __init__(self, options): 334 | self.options = options 335 | 336 | def sentiment(self): 337 | sentiment_obj = SentimentIntensityAnalyzer() 338 | result = sentiment_obj.polarity_scores(self.options['sentence']) 339 | return self._dump(result) 340 | 341 | def _dump(self, result): 342 | response = { 343 | 'status': True, 344 | 'result': result 345 | } 346 | return response 347 | 348 | -------------------------------------------------------------------------------- /app/views.py: -------------------------------------------------------------------------------- 1 | from rest_framework.views import APIView 2 | from rest_framework.response import Response 3 | 4 | from util import NLTKStem, NLTKTokenize, NLTKTag, NLTKner, NLTKLemmatize, \ 5 | NLTKSentiment 6 | 7 | 8 | class HomeView(APIView): 9 | """ 10 | Landing Page View 11 | """ 12 | 13 | def get(self, request): 14 | res = { 15 | 'examples': [ 16 | '/api/stem?words=', 17 | '/api/stem?words=&stemmer=', 18 | '/api/stem?words=&stemmer=snowball&language&ignore_stopwords=', 19 | '/api/tokenize?sentence=', 20 | '/api/tokenize?sentence=&tokenizer=', 21 | '/api/tag?sentence=', 22 | '/api/tag?sentence=&tagger=', 23 | '/api/tag?sentence=&tagger=&train=', 24 | '/api/tag?sentence=&tokenizer=', 25 | '/api/ner?sentence=', 26 | '/api/ner?sentence=&tokenizer=', 27 | '/api/sentiment?sentence=' 28 | ], 29 | 'message': 'see app/util.py for details', 30 | 'repository': 'https://github.com/vipul-sharma20/nltk-api-server', 31 | } 32 | return Response(res) 33 | 34 | 35 | class StemView(APIView): 36 | """ 37 | View for Stemming words 38 | """ 39 | 40 | def get(self, request): 41 | data = request.GET 42 | stem_obj = NLTKStem(data) 43 | res = stem_obj.stem() 44 | 45 | return Response(res) 46 | 47 | 48 | class TokenizeView(APIView): 49 | """ 50 | View for Tokenizing strings 51 | """ 52 | 53 | def get(self, request): 54 | data = request.GET 55 | tokenize_obj = NLTKTokenize(data) 56 | res = tokenize_obj.tokenize() 57 | 58 | return Response(res) 59 | 60 | 61 | class POSTagView(APIView): 62 | """ 63 | View for Part Of Speech tagging 64 | """ 65 | 66 | def get(self, request): 67 | data = request.GET 68 | pos_obj = NLTKTag(data) 69 | res = pos_obj.pos_tag() 70 | 71 | return Response(res) 72 | 73 | 74 | class NERView(APIView): 75 | """ 76 | View for Named Entity Recognition 77 | """ 78 | 79 | def get(self, request): 80 | data = request.GET 81 | ner_obj = NLTKner(data) 82 | res = ner_obj.ner() 83 | 84 | return Response(res) 85 | 86 | 87 | class LemmatizeView(APIView): 88 | """ 89 | View for Lemmatization 90 | """ 91 | 92 | def get(self, request): 93 | data = request.GET 94 | lemma_obj = NLTKLemmatize(data) 95 | res = lemma_obj.lemma() 96 | 97 | return Response(res) 98 | 99 | 100 | class SentimentView(APIView): 101 | """ 102 | View for Sentiment Analysis 103 | """ 104 | 105 | def get(self, request): 106 | data = request.GET 107 | senti_obj = NLTKSentiment(data) 108 | res = senti_obj.sentiment() 109 | 110 | return Response(res) 111 | 112 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nltk_api.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /nltk_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vipul-sharma20/nltk-api-server/b4859b687d66abfa3cea04dd1d7521c89f9485d8/nltk_api/__init__.py -------------------------------------------------------------------------------- /nltk_api/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for nltk_api project. 3 | """ 4 | 5 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 6 | import os 7 | 8 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | 10 | # SECURITY WARNING: keep the secret key used in production secret! 11 | SECRET_KEY = '3!4c6)^ggxuf!gbk$q2*ea52*oe+%)d%+!(td9mm5^6o_)bir2' 12 | 13 | # SECURITY WARNING: don't run with debug turned on in production! 14 | DEBUG = True 15 | 16 | ALLOWED_HOSTS = [] 17 | 18 | 19 | # Application definition 20 | 21 | INSTALLED_APPS = ( 22 | 'django.contrib.admin', 23 | 'django.contrib.auth', 24 | 'django.contrib.contenttypes', 25 | 'django.contrib.sessions', 26 | 'django.contrib.messages', 27 | 'django.contrib.staticfiles', 28 | 'rest_framework', 29 | ) 30 | 31 | MIDDLEWARE_CLASSES = ( 32 | 'django.contrib.sessions.middleware.SessionMiddleware', 33 | 'django.middleware.common.CommonMiddleware', 34 | 'django.middleware.csrf.CsrfViewMiddleware', 35 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 36 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', 37 | 'django.contrib.messages.middleware.MessageMiddleware', 38 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 39 | 'django.middleware.security.SecurityMiddleware', 40 | 'app.middleware.validate_middleware.ValidateParameterMiddleware', 41 | ) 42 | 43 | ROOT_URLCONF = 'nltk_api.urls' 44 | 45 | TEMPLATES = [ 46 | { 47 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 48 | 'DIRS': [os.path.join(BASE_DIR, 'app/templates/rest_framework')], 49 | 'APP_DIRS': True, 50 | 'OPTIONS': { 51 | 'context_processors': [ 52 | 'django.template.context_processors.debug', 53 | 'django.template.context_processors.request', 54 | 'django.contrib.auth.context_processors.auth', 55 | 'django.contrib.messages.context_processors.messages', 56 | ], 57 | }, 58 | }, 59 | ] 60 | 61 | WSGI_APPLICATION = 'nltk_api.wsgi.application' 62 | 63 | 64 | # Database 65 | 66 | DATABASES = { 67 | 'default': { 68 | 'ENGINE': 'django.db.backends.sqlite3', 69 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 70 | } 71 | } 72 | 73 | # REST frame 74 | #REST_FRAMEWORK = { 75 | #'DEFAULT_PERMISSION_CLASSES': [ 76 | # 'rest_framework.permissions.DjangoModelPermissionsOrAnonReadOnly' 77 | #] 78 | #} 79 | 80 | REST_FRAMEWORK = { 81 | 'DEFAULT_RENDERER_CLASSES': ( 82 | 'rest_framework.renderers.JSONRenderer', 83 | 'rest_framework.renderers.BrowsableAPIRenderer', 84 | ), 85 | 'DEFAULT_THROTTLE_CLASSES': ( 86 | 'rest_framework.throttling.AnonRateThrottle', 87 | ), 88 | 'DEFAULT_THROTTLE_RATES': { 89 | 'anon': '100/day', 90 | } 91 | } 92 | 93 | # Internationalization 94 | 95 | LANGUAGE_CODE = 'en-us' 96 | 97 | TIME_ZONE = 'UTC' 98 | 99 | USE_I18N = True 100 | 101 | USE_L10N = True 102 | 103 | USE_TZ = True 104 | 105 | 106 | # Static files (CSS, JavaScript, Images) 107 | 108 | STATIC_URL = '/static/' 109 | -------------------------------------------------------------------------------- /nltk_api/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | nltk_api URL Configuration 3 | """ 4 | import app 5 | from app import urls 6 | 7 | from django.conf.urls import url, include 8 | from django.contrib import admin 9 | 10 | from rest_framework import routers 11 | 12 | router = routers.DefaultRouter() 13 | 14 | urlpatterns = [ 15 | url(r'^', include(router.urls)), 16 | url(r'^', include(app.urls)), 17 | url(r'^admin/', include(admin.site.urls)), 18 | ] 19 | 20 | -------------------------------------------------------------------------------- /nltk_api/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for nltk_api project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nltk_api.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==1.8 2 | djangorestframework==3.5.3 3 | httpie==0.9.6 4 | nltk==3.2.1 5 | Pygments==2.1.3 6 | requests==2.12.1 7 | twython==3.4.0 8 | numpy==1.11.2 9 | --------------------------------------------------------------------------------