├── Dockerfile
├── README.md
└── python
    ├── CLAAS_public.py
    └── requirements.txt

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/anaconda:4.4.0
2 | MAINTAINER Vivek Kalyanarangan, https://machinelearningblogs.com/about/
3 | COPY python/ /usr/local/python/
4 | EXPOSE 8180
5 | WORKDIR /usr/local/python/
6 | RUN pip install -r requirements.txt \
7 |     && python -m nltk.downloader averaged_perceptron_tagger punkt wordnet
8 | CMD python CLAAS_public.py
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text-Clustering-API
2 | Implementation of a text clustering algorithm using K-means to derive quick insights from unstructured text.
3 | Please check the links below for details:
4 | 
5 | + https://machinelearningblogs.com/2017/01/26/text-clustering-get-quick-insights-from-unstructured-data/
6 | + https://machinelearningblogs.com/2017/06/23/text-clustering-get-quick-insights-unstructured-data-2/
7 | 
8 | ## Docker Setup
9 | 0. Install [Docker](https://docs.docker.com/engine/installation/)
10 | 1. Run `git clone https://github.com/vivekkalyanarangan30/Text-Clustering-API`
11 | 2. Open a Docker terminal and navigate to `/path/to/Text-Clustering-API`
12 | 3. Run `docker build -t clustering-api .`
13 | 4. Run `docker run -p 8180:8180 clustering-api`
14 | 5. Access http://192.168.99.100:8180/apidocs/index.html from your browser (assuming you are on Windows and docker-machine has that IP; otherwise use http://localhost:8180/apidocs/index.html)
15 | 
16 | ## Native Setup
17 | 1. Install the Anaconda distribution of Python 2.7
18 | 2. `pip install -r requirements.txt`
19 | 3. Download the required *nltk* data (run `nltk.download()` from a Python console and fetch the `averaged_perceptron_tagger`, `punkt` and `wordnet` packages)
20 | 
21 | ### Run it
22 | 1. Place the script in any folder
23 | 2. Open a command prompt and navigate to that folder
24 | 3. Type `python CLAAS_public.py` and hit enter
25 | 4. Go to http://localhost:8180/apidocs/index.html in your browser (preferably Chrome) and start using the API.
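### Example API calls

The service exposes two endpoints, `/unguided_cluster` and `/guided_cluster`. The sketch below shows one way to call them from Python with the `requests` library; it is a minimal example, and the file names `my_data.csv` and `my_keywords.csv`, the column name `text`, and the cluster count are placeholders for your own data.

```python
import requests

BASE = "http://localhost:8180"

# Unguided clustering: upload a CSV and let KMeans group rows by the text
# in the chosen column. The response is the same rows plus a "cluster_num"
# column, returned as CSV.
with open("my_data.csv", "rb") as f:
    resp = requests.post(
        BASE + "/unguided_cluster",
        params={"col": "text", "no_of_clusters": 10},
        files={"dataset": f},
    )
resp.raise_for_status()
with open("clusters.csv", "wb") as out:
    out.write(resp.content)

# Guided clustering: additionally upload a one-column CSV of keywords/phrases;
# the response flags which phrases occur in each row's text.
with open("my_data.csv", "rb") as data_f, open("my_keywords.csv", "rb") as keys_f:
    resp = requests.post(
        BASE + "/guided_cluster",
        params={"col": "text"},
        files={"dataset": data_f, "phrases": keys_f},
    )
resp.raise_for_status()
with open("guided_clusters.csv", "wb") as out:
    out.write(resp.content)
```

Both endpoints return a CSV attachment, so the raw response body can be written straight to disk as shown above.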
26 | -------------------------------------------------------------------------------- /python/CLAAS_public.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun May 08 23:43:37 2016 4 | 5 | @author: Vivek Kalyanarangan 6 | """ 7 | 8 | import pandas as pd 9 | import string 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | from sklearn.feature_extraction.text import CountVectorizer 12 | from stemming.porter2 import stem 13 | from sklearn.metrics.pairwise import cosine_similarity 14 | import numpy as np 15 | from scipy.sparse import hstack 16 | import flask 17 | from sklearn import decomposition 18 | from nltk.tag.perceptron import PerceptronTagger 19 | import nltk 20 | import re 21 | import pickle 22 | import os 23 | import datetime 24 | import time 25 | import requests 26 | import httplib2 27 | import scipy 28 | from scipy.sparse import hstack,csr_matrix,coo_matrix 29 | from sklearn.cluster import KMeans 30 | from nltk.stem import WordNetLemmatizer 31 | import StringIO 32 | 33 | from flask import Flask, jsonify, request, Response 34 | from flasgger import Swagger 35 | 36 | app = Flask(__name__) 37 | app.config['SWAGGER'] = { 38 | "swagger_version": "2.0", 39 | # headers are optional, the following are default 40 | # "headers": [ 41 | # ('Access-Control-Allow-Origin', '*'), 42 | # ('Access-Control-Allow-Headers', "Authorization, Content-Type"), 43 | # ('Access-Control-Expose-Headers', "Authorization"), 44 | # ('Access-Control-Allow-Methods', "GET, POST, PUT, DELETE, OPTIONS"), 45 | # ('Access-Control-Allow-Credentials', "true"), 46 | # ('Access-Control-Max-Age', 60 * 60 * 24 * 20), 47 | # ], 48 | # another optional settings 49 | # "url_prefix": "swaggerdocs", 50 | # "subdomain": "docs.mysite,com", 51 | # specs are also optional if not set /spec is registered exposing all views 52 | "specs": [ 53 | { 54 | "version": "2.0.0", 55 | "title": "Clustering API", 56 | "endpoint": 'v2_spec', 57 | "route": '/v2/spec', 58 | "description": "This API will help you bin individual data points into groups in a guided and unguided manner" 59 | # rule_filter is optional 60 | # it is a callable to filter the views to extract 61 | 62 | # "rule_filter": lambda rule: rule.endpoint.startswith( 63 | # 'should_be_v1_only' 64 | # ) 65 | } 66 | ] 67 | } 68 | Swagger(app) 69 | 70 | tagger = PerceptronTagger() 71 | tagset = None 72 | stop = nltk.corpus.stopwords 73 | wordnet_lemmatizer = WordNetLemmatizer() 74 | 75 | grammar = '''REMOVE: {??} 76 | {} 77 | {
} 78 | {} 79 | {} 80 | {} 81 | {} 82 | {} 83 | {} 84 | {} 85 | {} 86 | {} 87 | {} 88 | {} 89 | {} 90 | {} 91 | {} 92 | {
} 93 | {} 94 | {} 95 | {} 96 | {} 97 | {} 98 | {} 99 | {} 100 | {} 101 | {} 102 | ''' 103 | 104 | def stem_doc(x): 105 | red_text = [stem(word.strip()) for word in x.split(" ") if word.strip()!=''] 106 | return ' '.join(red_text) 107 | 108 | def lem(x): 109 | try: 110 | return wordnet_lemmatizer.lemmatize(x,pos='v') 111 | except: 112 | return x 113 | 114 | def remove_url(x): 115 | return re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x) 116 | 117 | def cleanse_text(text): 118 | if text: 119 | text = remove_url(text) 120 | addl_txt = addl_clean_words(text) 121 | red_text = clean_words(addl_txt) 122 | 123 | no_gram = red_text 124 | try: 125 | no_gram = remove_grammar(red_text) 126 | except: 127 | no_gram = red_text 128 | 129 | #clean = ' '.join([i for i in no_gram.split() if i not in stop]) 130 | if no_gram: 131 | clean = ' '.join([i for i in no_gram.split()]) 132 | red_text = [lem(word) for word in clean.split(" ")] 133 | red_text = [stem(word) for word in clean.split(" ")] 134 | return clean_words(' '.join(red_text)) 135 | else: 136 | return no_gram 137 | else: 138 | return text 139 | 140 | def cleanse_text_guided(text): 141 | if text: 142 | text = remove_url(text) 143 | addl_txt = addl_clean_words(text) 144 | red_text = clean_words_guided(addl_txt) 145 | 146 | no_gram = red_text 147 | try: 148 | no_gram = remove_grammar(red_text) 149 | except: 150 | no_gram = red_text 151 | 152 | #clean = ' '.join([i for i in no_gram.split() if i not in stop]) 153 | if no_gram: 154 | clean = ' '.join([i for i in no_gram.split()]) 155 | red_text = [lem(word) for word in clean.split(" ")] 156 | red_text = [stem(word) for word in clean.split(" ")] 157 | return clean_words(' '.join(red_text)) 158 | else: 159 | return no_gram 160 | else: 161 | return text 162 | 163 | 164 | def addl_clean_words(words): 165 | # any additional data pre-processing 166 | words = words.replace('can\'t','cannot') 167 | words = words.replace('won\'t','would not') 168 | words = words.replace('doesn\'t','does not') 169 | return words 170 | 171 | def clean_words(words): 172 | if words: 173 | words = remove_email(words) 174 | words = words.replace('\t',' ') 175 | words = words.replace(',',' ') 176 | words = words.replace(':',' ') 177 | words = words.replace(';',' ') 178 | words = words.replace('=',' ') 179 | #words = words.replace('\x92','') # apostrophe encoding 180 | words = words.replace('\x08','\\b') # \b is being treated as backspace 181 | #words = ''.join([i for i in words if not i.isdigit()]) 182 | words = words.replace('_',' ') 183 | words = words.replace('(',' ') 184 | words = words.replace(')',' ') 185 | words = words.replace('+',' ') 186 | words = words.replace('-',' ') 187 | words = words.replace('`',' ') 188 | words = words.replace('\'',' ') 189 | words = words.replace('.',' ') 190 | words = words.replace('#',' ') 191 | words = words.replace('/',' ') 192 | words = words.replace('_',' ') 193 | words = words.replace('"',' ') 194 | return words.strip() 195 | return words 196 | 197 | def clean_words_guided(words): 198 | if words: 199 | words = remove_email(words) 200 | words = words.replace('\t',' ') 201 | words = words.replace(',',' ') 202 | words = words.replace(':',' ') 203 | words = words.replace(';',' ') 204 | words = words.replace('=',' ') 205 | #words = words.replace('\x92','') # apostrophe encoding 206 | words = words.replace('\x08','\\b') # \b is being treated as backspace 207 | #words = ''.join([i for i in words if not i.isdigit()]) 208 | words = words.replace('_',' ') 209 | words = words.replace('(',' 
')
210 |         words = words.replace(')',' ')
211 |         words = words.replace('+',' ')
212 |         words = words.replace('-',' ')
213 |         words = words.replace('`',' ')
214 |         words = words.replace('\'',' ')
215 |         words = words.replace('.',' ')
216 |         words = words.replace('#',' ')
217 |         words = words.replace('/',' ')
218 |         words = words.replace('_',' ')
219 |         words = words.replace('"',' ')
220 |         words = words.replace("'",' ')
221 |         return words.strip()
222 |     return words
223 | 
224 | 
225 | def remove_grammar(review):
226 |     sentences = nltk.sent_tokenize(review)
227 |     sentences = [nltk.word_tokenize(sent) for sent in sentences]
228 |     result_review = []
229 |     for sentence in sentences:
230 |         if sentence:  # skip empty token lists
231 |             tagged_review = nltk.tag._pos_tag(sentence, tagset, tagger)
232 |             cp = nltk.RegexpParser(grammar)
233 |             result = cp.parse(tagged_review)
234 |             result_review.append(traverseTree(result))
235 |     return ' '.join(result_review)
236 | 
237 | # Remove email addresses, email/phone markers and single-character tokens
238 | def remove_email(words):
239 |     mod_words = ''
240 |     if words:
241 |         if words.strip():
242 |             for word in words.split(' '):
243 |                 if (word.strip().lower()=='email') or (word.strip().lower()=='phn') or (word.strip().lower()=='phone') or (len(word.strip())<=1):
244 |                     continue
245 |                 elif not re.match(r"[^@]+@[^@]+\.[^@]+", word.lower()):
246 |                     mod_words = mod_words+' '+word
247 |                 #else:
248 |         else:
249 |             return words
250 |     return mod_words.strip()
251 | 
252 | def traverseTree(tree):
253 |     imp_words = []
254 |     for n in tree:
255 |         if not isinstance(n, nltk.tree.Tree):
256 |             if isinstance(n, tuple):
257 |                 imp_words.append(n[0])
258 |             else:
259 |                 continue
260 |     return ' '.join([word for word in imp_words])
261 | 
262 | def euc_dist(a,b):
263 |     sum_ = scipy.sparse.csr_matrix.sum(a.multiply(b),axis=1)
264 |     return sum_
265 | 
266 | @app.route('/unguided_cluster', methods=['POST'])
267 | def index():
268 |     """
269 |     This API will help you generate clusters based on keywords present in unstructured text
270 |     Call this API passing the following parameters -
271 |     Dataset - The CSV file you want to cluster
272 |     Column Name based on which clustering needs to be done
273 |     Number of Clusters
274 |     Returns the input data as a CSV with an additional cluster_num column
275 |     ---
276 |     tags:
277 |       - Clustering API
278 |     parameters:
279 |       - name: dataset
280 |         in: formData
281 |         type: file
282 |         required: true
283 |         description: The dataset file (CSV) to upload. 
284 | - name: col 285 | in: query 286 | type: string 287 | required: true 288 | description: The column name on which the clustering needs to be done 289 | - name: no_of_clusters 290 | in: query 291 | type: integer 292 | required: true 293 | description: The number of clusters 294 | """ 295 | #file_ = request.args.get('upload') 296 | #print request.files 297 | data = pd.read_csv(request.files['dataset']) 298 | #loc = request.args.get('dataset') 299 | #ext = loc.split('.')[-1] 300 | #ext='csv' 301 | #if 'ext' in request.args: 302 | # ext = request.args.get('ext') 303 | 304 | unstructure = '' 305 | if 'col' in request.args: 306 | unstructure = request.args.get('col') 307 | print(unstructure) 308 | no_of_clusters = 10 309 | if 'no_of_clusters' in request.args: 310 | no_of_clusters = int(request.args.get('no_of_clusters')) 311 | #data=pd.DataFrame() 312 | # if ext=='csv': 313 | # data = pd.read_csv(loc) 314 | # elif ext=='xlsx': 315 | # data = pd.read_excel(loc) 316 | # elif ext=='xls': 317 | # data = pd.read_excel(loc) 318 | 319 | data = data.fillna('NULL') 320 | data['clean_sum'] = data[unstructure].apply(lambda x: cleanse_text(x)) 321 | 322 | vectorizer = CountVectorizer(analyzer='word',stop_words='english',decode_error='ignore',binary=True) 323 | #vectorizer.fit(data[unstructure]) 324 | 325 | counts = vectorizer.fit_transform(data['clean_sum']) 326 | 327 | kmeans = KMeans(n_clusters=no_of_clusters,n_jobs=-1) 328 | 329 | data['cluster_num'] = kmeans.fit_predict(counts) 330 | data = data.drop(['clean_sum'],axis=1) 331 | output = StringIO.StringIO() 332 | data.to_csv(output,index=False) 333 | 334 | clusters = [] 335 | for i in range(np.shape(kmeans.cluster_centers_)[0]): 336 | data_cluster = pd.concat([pd.Series(vectorizer.get_feature_names()),pd.DataFrame(kmeans.cluster_centers_[i])],axis=1) 337 | data_cluster.columns = ['keywords','weights'] 338 | data_cluster = data_cluster.sort_values(by=['weights'],ascending=False) 339 | data_clust = data_cluster.head(n=10)['keywords'].tolist() 340 | clusters.append(data_clust) 341 | #print data_cluster.head(n=10)['keywords'] 342 | #data_CLUSTERS.to_csv('output_full.csv',index=False) 343 | pd.DataFrame(clusters).to_csv('keywords_.csv') 344 | data.to_csv('Q2.csv',index=False) 345 | 346 | resp = Response(output.getvalue(), mimetype="text/csv") 347 | resp.headers["Accept"] = "text/csv" 348 | resp.headers['Access-Control-Allow-Origin'] = '*' 349 | resp.headers["Content-Disposition"] = "attachment; filename=clusters.csv" 350 | return resp 351 | 352 | def phrase_in(x,phrase): 353 | if phrase in x: 354 | return True 355 | else: 356 | return None 357 | 358 | @app.route('/guided_cluster', methods=['POST']) 359 | def index_guided(): 360 | """ 361 | This API will help you generate clusters based on keywords provided by you 362 | Call this api passing the following parameters - 363 | Dataset - The data you want to cluster 364 | Column Name based on which clustering needs to be done 365 | Comma separated values of the keywords 366 | --- 367 | tags: 368 | - Clustering API 369 | parameters: 370 | - name: dataset 371 | in: formData 372 | type: file 373 | required: true 374 | description: The dataset 375 | - name: col 376 | in: query 377 | type: string 378 | required: true 379 | description: The column name based on which the clustering needs to be done 380 | - name: phrases 381 | in: formData 382 | type: file 383 | required: true 384 | description: The keywords for clustering in a single column in a csv 385 | 386 | """ 387 | #file_ = request.args.get('upload') 388 | 
#print request.files 389 | data = pd.read_csv(request.files['dataset']) 390 | data_keywords = pd.read_csv(request.files['phrases'],header=None) 391 | #loc = request.args.get('dataset') 392 | #ext = loc.split('.')[-1] 393 | #ext='csv' 394 | #if 'ext' in request.args: 395 | # ext = request.args.get('ext') 396 | 397 | unstructure = '' 398 | if 'col' in request.args: 399 | unstructure = request.args.get('col') 400 | 401 | data = data.fillna('NULL') 402 | data['clean_sum'] = data[unstructure].apply(lambda x: cleanse_text(x.lower())) 403 | #data.to_csv('clean_dat.csv',index=False) 404 | data_keywords = data_keywords.fillna('NULL') 405 | data_keywords[data_keywords.columns[0]] = data_keywords[data_keywords.columns[0]].apply(lambda x: str(x).lower()) 406 | data_keywords['clean_keys'] = data_keywords[data_keywords.columns[0]].apply(lambda x: cleanse_text_guided(x)) 407 | vocab_keys = data_keywords['clean_keys'].drop_duplicates().tolist() 408 | 409 | counts = np.zeros(shape=(np.shape(data)[0],len(vocab_keys))) 410 | data_counts = pd.DataFrame(counts,columns=vocab_keys) 411 | for phrase in vocab_keys: 412 | data_counts[phrase] = data['clean_sum'].apply(lambda x: phrase_in(x,phrase)) 413 | data = data.drop(['clean_sum'],axis=1) 414 | data_output = pd.concat([data, data_counts], axis=1) 415 | output = StringIO.StringIO() 416 | data_output.to_csv(output,index=False) 417 | 418 | resp = Response(output.getvalue(), mimetype="text/csv") 419 | resp.headers["Accept"] = "text/csv" 420 | resp.headers['Access-Control-Allow-Origin'] = '*' 421 | resp.headers["Content-Disposition"] = "attachment; filename=clusters.csv" 422 | return resp 423 | 424 | if __name__ == '__main__': 425 | app.run(host='0.0.0.0',debug=True,port=8180,use_evalex=False,threaded=True) 426 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flasgger==0.8.1 3 | httplib2 4 | stemming 5 | --------------------------------------------------------------------------------