├── Dockerfile
├── README.md
└── python
    ├── CLAAS_public.py
    └── requirements.txt

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/anaconda:4.4.0
2 | MAINTAINER Vivek Kalyanarangan, https://machinelearningblogs.com/about/
3 | COPY python/ /usr/local/python/
4 | EXPOSE 8180
5 | WORKDIR /usr/local/python/
6 | RUN pip install -r requirements.txt \
7 |     && python -m nltk.downloader averaged_perceptron_tagger punkt wordnet
8 | CMD python CLAAS_public.py
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text-Clustering-API
2 | Implementation of a text clustering algorithm using K-means to derive quick insights from unstructured text.
3 | Please check the links below for details:
4 | 
5 | + https://machinelearningblogs.com/2017/01/26/text-clustering-get-quick-insights-from-unstructured-data/
6 | + https://machinelearningblogs.com/2017/06/23/text-clustering-get-quick-insights-unstructured-data-2/
7 | 
8 | ## Docker Setup
9 | 0. Install [Docker](https://docs.docker.com/engine/installation/)
10 | 1. Run `git clone https://github.com/vivekkalyanarangan30/Text-Clustering-API`
11 | 2. Open a Docker terminal and navigate to `/path/to/Text-Clustering-API`
12 | 3. Run `docker build -t clustering-api .`
13 | 4. Run `docker run -p 8180:8180 clustering-api`
14 | 5. Access http://192.168.99.100:8180/apidocs/index.html from your browser (assuming you are on Windows and docker-machine has that IP; otherwise use http://localhost:8180/apidocs/index.html)
15 | 
16 | ## Native Setup
17 | 1. Install the Anaconda distribution of Python 2.7
18 | 2. `pip install -r requirements.txt`
19 | 3. Download the required *nltk* data (run `nltk.download()` from a Python console and fetch the `averaged_perceptron_tagger`, `punkt` and `wordnet` packages)
20 | 
21 | ### Run it
22 | 1. Place the script in any folder
23 | 2. Open a command prompt and navigate to that folder
24 | 3. Type `python CLAAS_public.py` and hit enter
25 | 4. Go to http://localhost:8180/apidocs/index.html in your browser (preferably Chrome) and start using the API.
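### Example API calls

The service exposes two endpoints, `/unguided_cluster` and `/guided_cluster`. The sketch below shows one way to call them from Python with the `requests` library; it is a minimal example, and the file names `my_data.csv` and `my_keywords.csv`, the column name `text`, and the cluster count are placeholders for your own data.

```python
import requests

BASE = "http://localhost:8180"

# Unguided clustering: upload a CSV and let KMeans group rows by the text
# in the chosen column. The response is the same rows plus a "cluster_num"
# column, returned as CSV.
with open("my_data.csv", "rb") as f:
    resp = requests.post(
        BASE + "/unguided_cluster",
        params={"col": "text", "no_of_clusters": 10},
        files={"dataset": f},
    )
resp.raise_for_status()
with open("clusters.csv", "wb") as out:
    out.write(resp.content)

# Guided clustering: additionally upload a one-column CSV of keywords/phrases;
# the response flags which phrases occur in each row's text.
with open("my_data.csv", "rb") as data_f, open("my_keywords.csv", "rb") as keys_f:
    resp = requests.post(
        BASE + "/guided_cluster",
        params={"col": "text"},
        files={"dataset": data_f, "phrases": keys_f},
    )
resp.raise_for_status()
with open("guided_clusters.csv", "wb") as out:
    out.write(resp.content)
```

Both endpoints return a CSV attachment, so the raw response body can be written straight to disk as shown above.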
26 | -------------------------------------------------------------------------------- /python/CLAAS_public.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun May 08 23:43:37 2016 4 | 5 | @author: Vivek Kalyanarangan 6 | """ 7 | 8 | import pandas as pd 9 | import string 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | from sklearn.feature_extraction.text import CountVectorizer 12 | from stemming.porter2 import stem 13 | from sklearn.metrics.pairwise import cosine_similarity 14 | import numpy as np 15 | from scipy.sparse import hstack 16 | import flask 17 | from sklearn import decomposition 18 | from nltk.tag.perceptron import PerceptronTagger 19 | import nltk 20 | import re 21 | import pickle 22 | import os 23 | import datetime 24 | import time 25 | import requests 26 | import httplib2 27 | import scipy 28 | from scipy.sparse import hstack,csr_matrix,coo_matrix 29 | from sklearn.cluster import KMeans 30 | from nltk.stem import WordNetLemmatizer 31 | import StringIO 32 | 33 | from flask import Flask, jsonify, request, Response 34 | from flasgger import Swagger 35 | 36 | app = Flask(__name__) 37 | app.config['SWAGGER'] = { 38 | "swagger_version": "2.0", 39 | # headers are optional, the following are default 40 | # "headers": [ 41 | # ('Access-Control-Allow-Origin', '*'), 42 | # ('Access-Control-Allow-Headers', "Authorization, Content-Type"), 43 | # ('Access-Control-Expose-Headers', "Authorization"), 44 | # ('Access-Control-Allow-Methods', "GET, POST, PUT, DELETE, OPTIONS"), 45 | # ('Access-Control-Allow-Credentials', "true"), 46 | # ('Access-Control-Max-Age', 60 * 60 * 24 * 20), 47 | # ], 48 | # another optional settings 49 | # "url_prefix": "swaggerdocs", 50 | # "subdomain": "docs.mysite,com", 51 | # specs are also optional if not set /spec is registered exposing all views 52 | "specs": [ 53 | { 54 | "version": "2.0.0", 55 | "title": "Clustering API", 56 | "endpoint": 'v2_spec', 57 | "route": '/v2/spec', 58 | "description": "This API will help you bin individual data points into groups in a guided and unguided manner" 59 | # rule_filter is optional 60 | # it is a callable to filter the views to extract 61 | 62 | # "rule_filter": lambda rule: rule.endpoint.startswith( 63 | # 'should_be_v1_only' 64 | # ) 65 | } 66 | ] 67 | } 68 | Swagger(app) 69 | 70 | tagger = PerceptronTagger() 71 | tagset = None 72 | stop = nltk.corpus.stopwords 73 | wordnet_lemmatizer = WordNetLemmatizer() 74 | 75 | grammar = '''REMOVE: {??} 76 | {} 77 | {
} 78 | {} 79 | {} 80 | {} 81 | {} 82 | {} 83 | {} 84 | {} 85 | {} 86 | {} 87 | {} 88 | {} 89 | {} 90 | {} 91 | {} 92 | {
} 93 | {} 94 | {} 95 | {} 96 | {} 97 | {} 98 | {} 99 | {} 100 | {} 101 | {} 102 | ''' 103 | 104 | def stem_doc(x): 105 | red_text = [stem(word.strip()) for word in x.split(" ") if word.strip()!=''] 106 | return ' '.join(red_text) 107 | 108 | def lem(x): 109 | try: 110 | return wordnet_lemmatizer.lemmatize(x,pos='v') 111 | except: 112 | return x 113 | 114 | def remove_url(x): 115 | return re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x) 116 | 117 | def cleanse_text(text): 118 | if text: 119 | text = remove_url(text) 120 | addl_txt = addl_clean_words(text) 121 | red_text = clean_words(addl_txt) 122 | 123 | no_gram = red_text 124 | try: 125 | no_gram = remove_grammar(red_text) 126 | except: 127 | no_gram = red_text 128 | 129 | #clean = ' '.join([i for i in no_gram.split() if i not in stop]) 130 | if no_gram: 131 | clean = ' '.join([i for i in no_gram.split()]) 132 | red_text = [lem(word) for word in clean.split(" ")] 133 | red_text = [stem(word) for word in clean.split(" ")] 134 | return clean_words(' '.join(red_text)) 135 | else: 136 | return no_gram 137 | else: 138 | return text 139 | 140 | def cleanse_text_guided(text): 141 | if text: 142 | text = remove_url(text) 143 | addl_txt = addl_clean_words(text) 144 | red_text = clean_words_guided(addl_txt) 145 | 146 | no_gram = red_text 147 | try: 148 | no_gram = remove_grammar(red_text) 149 | except: 150 | no_gram = red_text 151 | 152 | #clean = ' '.join([i for i in no_gram.split() if i not in stop]) 153 | if no_gram: 154 | clean = ' '.join([i for i in no_gram.split()]) 155 | red_text = [lem(word) for word in clean.split(" ")] 156 | red_text = [stem(word) for word in clean.split(" ")] 157 | return clean_words(' '.join(red_text)) 158 | else: 159 | return no_gram 160 | else: 161 | return text 162 | 163 | 164 | def addl_clean_words(words): 165 | # any additional data pre-processing 166 | words = words.replace('can\'t','cannot') 167 | words = words.replace('won\'t','would not') 168 | words = words.replace('doesn\'t','does not') 169 | return words 170 | 171 | def clean_words(words): 172 | if words: 173 | words = remove_email(words) 174 | words = words.replace('\t',' ') 175 | words = words.replace(',',' ') 176 | words = words.replace(':',' ') 177 | words = words.replace(';',' ') 178 | words = words.replace('=',' ') 179 | #words = words.replace('\x92','') # apostrophe encoding 180 | words = words.replace('\x08','\\b') # \b is being treated as backspace 181 | #words = ''.join([i for i in words if not i.isdigit()]) 182 | words = words.replace('_',' ') 183 | words = words.replace('(',' ') 184 | words = words.replace(')',' ') 185 | words = words.replace('+',' ') 186 | words = words.replace('-',' ') 187 | words = words.replace('`',' ') 188 | words = words.replace('\'',' ') 189 | words = words.replace('.',' ') 190 | words = words.replace('#',' ') 191 | words = words.replace('/',' ') 192 | words = words.replace('_',' ') 193 | words = words.replace('"',' ') 194 | return words.strip() 195 | return words 196 | 197 | def clean_words_guided(words): 198 | if words: 199 | words = remove_email(words) 200 | words = words.replace('\t',' ') 201 | words = words.replace(',',' ') 202 | words = words.replace(':',' ') 203 | words = words.replace(';',' ') 204 | words = words.replace('=',' ') 205 | #words = words.replace('\x92','') # apostrophe encoding 206 | words = words.replace('\x08','\\b') # \b is being treated as backspace 207 | #words = ''.join([i for i in words if not i.isdigit()]) 208 | words = words.replace('_',' ') 209 | words = words.replace('(',' 
')
210 |         words = words.replace(')',' ')
211 |         words = words.replace('+',' ')
212 |         words = words.replace('-',' ')
213 |         words = words.replace('`',' ')
214 |         words = words.replace('\'',' ')
215 |         words = words.replace('.',' ')
216 |         words = words.replace('#',' ')
217 |         words = words.replace('/',' ')
218 |         words = words.replace('_',' ')
219 |         words = words.replace('"',' ')
220 |         words = words.replace("'",' ')
221 |         return words.strip()
222 |     return words
223 | 
224 | 
225 | def remove_grammar(review):
226 |     sentences = nltk.sent_tokenize(review)
227 |     sentences = [nltk.word_tokenize(sent) for sent in sentences]
228 |     result_review = []
229 |     for sentence in sentences:
230 |         if sentence:  # skip empty token lists
231 |             tagged_review = nltk.tag._pos_tag(sentence, tagset, tagger)
232 |             cp = nltk.RegexpParser(grammar)
233 |             result = cp.parse(tagged_review)
234 |             result_review.append(traverseTree(result))
235 |     return ' '.join(result_review)
236 | 
237 | # Remove email addresses, email/phone markers and single-character tokens
238 | def remove_email(words):
239 |     mod_words = ''
240 |     if words:
241 |         if words.strip():
242 |             for word in words.split(' '):
243 |                 if (word.strip().lower()=='email') or (word.strip().lower()=='phn') or (word.strip().lower()=='phone') or (len(word.strip())<=1):
244 |                     continue
245 |                 elif not re.match(r"[^@]+@[^@]+\.[^@]+", word.lower()):
246 |                     mod_words = mod_words+' '+word
247 |                 #else:
248 |         else:
249 |             return words
250 |     return mod_words.strip()
251 | 
252 | def traverseTree(tree):
253 |     imp_words = []
254 |     for n in tree:
255 |         if not isinstance(n, nltk.tree.Tree):
256 |             if isinstance(n, tuple):
257 |                 imp_words.append(n[0])
258 |             else:
259 |                 continue
260 |     return ' '.join([word for word in imp_words])
261 | 
262 | def euc_dist(a,b):
263 |     sum_ = scipy.sparse.csr_matrix.sum(a.multiply(b),axis=1)
264 |     return sum_
265 | 
266 | @app.route('/unguided_cluster', methods=['POST'])
267 | def index():
268 |     """
269 |     This API will help you generate clusters based on keywords present in unstructured text
270 |     Call this API passing the following parameters -
271 |     Dataset - The CSV file you want to cluster
272 |     Column Name based on which clustering needs to be done
273 |     Number of Clusters
274 |     Returns the input data as a CSV with an additional cluster_num column
275 |     ---
276 |     tags:
277 |       - Clustering API
278 |     parameters:
279 |       - name: dataset
280 |         in: formData
281 |         type: file
282 |         required: true
283 |         description: The dataset file (CSV) to upload. 
284 | - name: col 285 | in: query 286 | type: string 287 | required: true 288 | description: The column name on which the clustering needs to be done 289 | - name: no_of_clusters 290 | in: query 291 | type: integer 292 | required: true 293 | description: The number of clusters 294 | """ 295 | #file_ = request.args.get('upload') 296 | #print request.files 297 | data = pd.read_csv(request.files['dataset']) 298 | #loc = request.args.get('dataset') 299 | #ext = loc.split('.')[-1] 300 | #ext='csv' 301 | #if 'ext' in request.args: 302 | # ext = request.args.get('ext') 303 | 304 | unstructure = '' 305 | if 'col' in request.args: 306 | unstructure = request.args.get('col') 307 | print(unstructure) 308 | no_of_clusters = 10 309 | if 'no_of_clusters' in request.args: 310 | no_of_clusters = int(request.args.get('no_of_clusters')) 311 | #data=pd.DataFrame() 312 | # if ext=='csv': 313 | # data = pd.read_csv(loc) 314 | # elif ext=='xlsx': 315 | # data = pd.read_excel(loc) 316 | # elif ext=='xls': 317 | # data = pd.read_excel(loc) 318 | 319 | data = data.fillna('NULL') 320 | data['clean_sum'] = data[unstructure].apply(lambda x: cleanse_text(x)) 321 | 322 | vectorizer = CountVectorizer(analyzer='word',stop_words='english',decode_error='ignore',binary=True) 323 | #vectorizer.fit(data[unstructure]) 324 | 325 | counts = vectorizer.fit_transform(data['clean_sum']) 326 | 327 | kmeans = KMeans(n_clusters=no_of_clusters,n_jobs=-1) 328 | 329 | data['cluster_num'] = kmeans.fit_predict(counts) 330 | data = data.drop(['clean_sum'],axis=1) 331 | output = StringIO.StringIO() 332 | data.to_csv(output,index=False) 333 | 334 | clusters = [] 335 | for i in range(np.shape(kmeans.cluster_centers_)[0]): 336 | data_cluster = pd.concat([pd.Series(vectorizer.get_feature_names()),pd.DataFrame(kmeans.cluster_centers_[i])],axis=1) 337 | data_cluster.columns = ['keywords','weights'] 338 | data_cluster = data_cluster.sort_values(by=['weights'],ascending=False) 339 | data_clust = data_cluster.head(n=10)['keywords'].tolist() 340 | clusters.append(data_clust) 341 | #print data_cluster.head(n=10)['keywords'] 342 | #data_CLUSTERS.to_csv('output_full.csv',index=False) 343 | pd.DataFrame(clusters).to_csv('keywords_.csv') 344 | data.to_csv('Q2.csv',index=False) 345 | 346 | resp = Response(output.getvalue(), mimetype="text/csv") 347 | resp.headers["Accept"] = "text/csv" 348 | resp.headers['Access-Control-Allow-Origin'] = '*' 349 | resp.headers["Content-Disposition"] = "attachment; filename=clusters.csv" 350 | return resp 351 | 352 | def phrase_in(x,phrase): 353 | if phrase in x: 354 | return True 355 | else: 356 | return None 357 | 358 | @app.route('/guided_cluster', methods=['POST']) 359 | def index_guided(): 360 | """ 361 | This API will help you generate clusters based on keywords provided by you 362 | Call this api passing the following parameters - 363 | Dataset - The data you want to cluster 364 | Column Name based on which clustering needs to be done 365 | Comma separated values of the keywords 366 | --- 367 | tags: 368 | - Clustering API 369 | parameters: 370 | - name: dataset 371 | in: formData 372 | type: file 373 | required: true 374 | description: The dataset 375 | - name: col 376 | in: query 377 | type: string 378 | required: true 379 | description: The column name based on which the clustering needs to be done 380 | - name: phrases 381 | in: formData 382 | type: file 383 | required: true 384 | description: The keywords for clustering in a single column in a csv 385 | 386 | """ 387 | #file_ = request.args.get('upload') 388 | 
#print request.files 389 | data = pd.read_csv(request.files['dataset']) 390 | data_keywords = pd.read_csv(request.files['phrases'],header=None) 391 | #loc = request.args.get('dataset') 392 | #ext = loc.split('.')[-1] 393 | #ext='csv' 394 | #if 'ext' in request.args: 395 | # ext = request.args.get('ext') 396 | 397 | unstructure = '' 398 | if 'col' in request.args: 399 | unstructure = request.args.get('col') 400 | 401 | data = data.fillna('NULL') 402 | data['clean_sum'] = data[unstructure].apply(lambda x: cleanse_text(x.lower())) 403 | #data.to_csv('clean_dat.csv',index=False) 404 | data_keywords = data_keywords.fillna('NULL') 405 | data_keywords[data_keywords.columns[0]] = data_keywords[data_keywords.columns[0]].apply(lambda x: str(x).lower()) 406 | data_keywords['clean_keys'] = data_keywords[data_keywords.columns[0]].apply(lambda x: cleanse_text_guided(x)) 407 | vocab_keys = data_keywords['clean_keys'].drop_duplicates().tolist() 408 | 409 | counts = np.zeros(shape=(np.shape(data)[0],len(vocab_keys))) 410 | data_counts = pd.DataFrame(counts,columns=vocab_keys) 411 | for phrase in vocab_keys: 412 | data_counts[phrase] = data['clean_sum'].apply(lambda x: phrase_in(x,phrase)) 413 | data = data.drop(['clean_sum'],axis=1) 414 | data_output = pd.concat([data, data_counts], axis=1) 415 | output = StringIO.StringIO() 416 | data_output.to_csv(output,index=False) 417 | 418 | resp = Response(output.getvalue(), mimetype="text/csv") 419 | resp.headers["Accept"] = "text/csv" 420 | resp.headers['Access-Control-Allow-Origin'] = '*' 421 | resp.headers["Content-Disposition"] = "attachment; filename=clusters.csv" 422 | return resp 423 | 424 | if __name__ == '__main__': 425 | app.run(host='0.0.0.0',debug=True,port=8180,use_evalex=False,threaded=True) 426 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flasgger==0.8.1 3 | httplib2 4 | stemming 5 | --------------------------------------------------------------------------------