├── Autocomplete.ipynb
├── README.md
├── autocomplete_server.py
├── autocompleter.py
└── sample_conversations.json

/Autocomplete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": []
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": null,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": []
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "# Loading"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 1,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "import autocompleter\n",
31 |     "autocompl = autocompleter.Autocompleter()"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 2,
37 |    "metadata": {},
38 |    "outputs": [
39 |     {
40 |      "name": "stdout",
41 |      "output_type": "stream",
42 |      "text": [
43 |       "load json file...\n",
44 |       "(22264, 3)\n"
45 |      ]
46 |     },
47 |     {
48 |      "data": {
49 |       "text/plain": [
50 |        "((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))"
51 |       ]
52 |      },
53 |      "execution_count": 2,
54 |      "metadata": {},
55 |      "output_type": "execute_result"
56 |     }
57 |    ],
58 |    "source": [
59 |     "df = autocompl.import_json(\"sample_conversations.json\")\n",
60 |     "df.shape, df.columns"
61 |    ]
62 |   },
63 |   {
64 |    "cell_type": "markdown",
65 |    "metadata": {},
66 |    "source": [
67 |     "The file contains 22K messages from conversations between customers and support representatives.\n",
68 |     "For the purposes of this project, we are only interested in completing the representatives' messages."
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 4,
74 |    "metadata": {},
75 |    "outputs": [
76 |     {
77 |      "data": {
78 |       "text/html": [
\n", 80 | "\n", 93 | "\n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | "
IsFromCustomerTextindex
0TrueHi! I placed an order on your website and I ca...0
1TrueI think I used my email address to log in.0
2TrueMy battery exploded!1
3TrueIt's on fire, it's melting the carpet!1
4TrueWhat should I do!1
\n", 135 | "
" 136 | ], 137 | "text/plain": [ 138 | " IsFromCustomer Text index\n", 139 | "0 True Hi! I placed an order on your website and I ca... 0\n", 140 | "1 True I think I used my email address to log in. 0\n", 141 | "2 True My battery exploded! 1\n", 142 | "3 True It's on fire, it's melting the carpet! 1\n", 143 | "4 True What should I do! 1" 144 | ] 145 | }, 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "df.head()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Data Selection and Cleaning" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "The data is going to separate the threads from the customer and the representative, separate the sentenses based on the punctuation (we will keep the punctuation), the final text will be cleaned up with some light regex and only the sentense larger than 1 word will be kept." 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Finally, since the representative has the tendency to ask the same question over and over again, the autocomplete is extremely useful by suggesting a complete sentense. In our case, we will count the number of occurence of the same sentense so we can use it as a feature later on and delete the duplicates." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "select representative threads...\n", 186 | "split sentenses on punctuation...\n", 187 | "Text Cleaning using simple regex...\n", 188 | "calculate nb words of sentenses...\n", 189 | "count occurence of sentenses...\n", 190 | "remove duplicates (keep last)...\n", 191 | "(8599, 5)\n" 192 | ] 193 | }, 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "((8599, 5),\n", 198 | " Index(['IsFromCustomer', 'Text', 'index', 'nb_words', 'Counts'], dtype='object'))" 199 | ] 200 | }, 201 | "execution_count": 5, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "new_df = autocompl.process_data(df)\n", 208 | "new_df.shape, new_df.columns" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "# Model and TFIDF matrix" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "A matrice of similarity is calculated based on the frequency of all the words in the data using tfidfvectorizer" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 6, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "tfidf_matrice (8599, 99395)\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# Ranking Function" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Finally, the autocomplete is calculating the similarity between the sentense in the data and the prefix of the sentense written by the representative. As a weight feature, we chose to reorder using the frequency of the most common similar sentense." 
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "Examples of autocompletions"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 28,
266 |    "metadata": {},
267 |    "outputs": [
268 |     {
269 |      "name": "stdout",
270 |      "output_type": "stream",
271 |      "text": [
272 |       "What is your  \n",
273 |       " \n"
274 |      ]
275 |     },
276 |     {
277 |      "data": {
278 |       "text/plain": [
279 |        "['What is your account number?',\n",
280 |        " 'What is your order number?',\n",
281 |        " 'What is your phone number?']"
282 |       ]
283 |      },
284 |      "execution_count": 28,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "prefix = 'What is your'\n",
291 |     "\n",
292 |     "print(prefix,\" \\n \")\n",
293 |     "\n",
294 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 22,
300 |    "metadata": {},
301 |    "outputs": [
302 |     {
303 |      "name": "stdout",
304 |      "output_type": "stream",
305 |      "text": [
306 |       "How can  \n"
307 |      ]
308 |     },
309 |     {
310 |      "data": {
311 |       "text/plain": [
312 |        "['How can I help you?',\n",
313 |        " 'How can I help you today?',\n",
314 |        " 'Ok lets see how I can help']"
315 |       ]
316 |      },
317 |      "execution_count": 22,
318 |      "metadata": {},
319 |      "output_type": "execute_result"
320 |     }
321 |    ],
322 |    "source": [
323 |     "prefix = 'How can'\n",
324 |     "print(prefix,\" \")\n",
325 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 29,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stdout",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "Let me  \n"
338 |      ]
339 |     },
340 |     {
341 |      "data": {
342 |       "text/plain": [
343 |        "['Let me investigate', 'Let me assist you', 'Let me look']"
344 |       ]
345 |      },
346 |      "execution_count": 29,
347 |      "metadata": {},
348 |      "output_type": "execute_result"
349 |     }
350 |    ],
351 |    "source": [
352 |     "prefix = 'Let me'\n",
353 |     "print(prefix,\" \")\n",
354 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": 31,
360 |    "metadata": {},
361 |    "outputs": [
362 |     {
363 |      "name": "stdout",
364 |      "output_type": "stream",
365 |      "text": [
366 |       "when was  \n"
367 |      ]
368 |     },
369 |     {
370 |      "data": {
371 |       "text/plain": [
372 |        "['When was the last time you changed your password?',\n",
373 |        " 'When was your flight scheduled for?',\n",
374 |        " 'When was the last time you tried?']"
375 |       ]
376 |      },
377 |      "execution_count": 31,
378 |      "metadata": {},
379 |      "output_type": "execute_result"
380 |     }
381 |    ],
382 |    "source": [
383 |     "prefix = 'when was'\n",
384 |     "print(prefix,\" \")\n",
385 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "markdown",
390 |    "metadata": {},
391 |    "source": [
392 |     "Now, with no uppercase and only the most important keywords..."
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 35,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "name": "stdout",
402 |      "output_type": "stream",
403 |      "text": [
404 |       "when time password  \n"
405 |      ]
406 |     },
407 |     {
408 |      "data": {
409 |       "text/plain": [
410 |        "['When was the last time you changed your password?',\n",
411 |        " 'When you select you password?',\n",
412 |        " 'Take your time']"
413 |       ]
414 |      },
415 |      "execution_count": 35,
416 |      "metadata": {},
417 |      "output_type": "execute_result"
418 |     }
419 |    ],
420 |    "source": [
421 |     "prefix = 'when time password'\n",
422 |     "print(prefix,\" \")\n",
423 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": []
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "# Online Sources for this Project"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 34,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "# https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation\n",
447 |     "# Kaggle Google Store competition, for the JSON read\n",
448 |     "# https://www.kaggle.com/hamishdickson/weighted-word-autocomplete-using-star-wars-dataset"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": null,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": []
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "metadata": {},
462 |    "outputs": [],
463 |    "source": []
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "#"
472 |    ]
473 |   }
474 |  ],
475 |  "metadata": {
476 |   "kernelspec": {
477 |    "display_name": "Python 3",
478 |    "language": "python",
479 |    "name": "python3"
480 |   },
481 |   "language_info": {
482 |    "codemirror_mode": {
483 |     "name": "ipython",
484 |     "version": 3
485 |    },
486 |    "file_extension": ".py",
487 |    "mimetype": "text/x-python",
488 |    "name": "python",
489 |    "nbconvert_exporter": "python",
490 |    "pygments_lexer": "ipython3",
491 |    "version": "3.6.3"
492 |   }
493 |  },
494 |  "nbformat": 4,
495 |  "nbformat_minor": 2
496 | }
497 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autocomplete-NLP
2 | Sentence-level NLP autocomplete
3 | 
4 | **Imagine** that you are a representative replying to customers online, asking more or less the same questions over and over. Wouldn't you like to get automatic suggestions instead of typing the same thing again and again?
5 | 
6 | An **autocomplete** is helpful, fast, and convenient, and it can correct grammatical and spelling errors at the same time.
7 | 
8 | 
9 | **_Project_**:
10 | 
11 | In this project's Jupyter notebook, we select a history of sentences written by the representatives, format and correct them using a few regex rules, and count them so we can estimate their frequency and their likelihood of being useful again.
12 | After computing a similarity matrix with the scikit-learn **tfidf** vectorizer (word frequency and normalization), we use this matrix to calculate the similarity between the first few words typed by the representative and the history of past messages.
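13 | 
14 | As a rough illustration, the core of the ranking boils down to the following minimal, self-contained sketch (this is not the project's exact code, and the toy `history` list below is made up):
15 | 
16 | ```python
17 | from sklearn.feature_extraction.text import TfidfVectorizer
18 | from sklearn.metrics.pairwise import linear_kernel
19 | 
20 | # toy history of representative sentences
21 | history = ["What is your account number?",
22 |            "What is your order number?",
23 |            "Let me investigate"]
24 | 
25 | # fit TF-IDF on the history, then score every sentence against a prefix
26 | vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5))
27 | tfidf = vectorizer.fit_transform(history)
28 | scores = linear_kernel(tfidf, vectorizer.transform(["What is your"])).ravel()
29 | 
30 | # candidate completions, most similar first
31 | print([history[i] for i in scores.argsort()[::-1]])
32 | ```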
33 | The autocomplete then recognizes the closest sentences and ranks the 3 best proposals:
34 | 
35 | If you were to type: `What is your`,
36 | the tool would suggest:
37 | > What is your account number?,
38 | What is your order number?,
39 | What is your phone number?
40 | 
41 | 
42 | If you were to type: `Let me`,
43 | the tool would suggest:
44 | > Let me investigate, Let me assist you, Let me look
45 | 
46 | If you were to type, without any uppercase: `when was`,
47 | > When was the last time you changed your password?,
48 | When was your flight scheduled for?,
49 | When was the last time you tried?
50 | 
51 | 
52 | 
53 | 
54 | **_Improvements_**:
55 | 1. Clean up the "Mr. Smith" and "Ms. Smith" artifacts in the dataset.
56 | 2. Match letters to words (spelling match), then match the words against the history of representative sentences.
57 | 3. Build an evaluation of the results:
58 |    - a. offline: using unseen conversations between representatives and customers, feed the representative's prefix to the model and check whether the sentence the representative actually wrote is among the 3 ranked proposals.
59 |    - b. online: count the number of times the representative actually selects one of the proposals, and the number of times they decide to ignore them.
60 | 4. Improve the system by first matching the customer's sentences to a topic/context id, in order to better predict the representative's answers.
61 | 
62 | 
63 | 
64 | 
--------------------------------------------------------------------------------
/autocomplete_server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify
2 | import autocompleter
3 | 
4 | app = Flask(__name__)
5 | 
6 | @app.route('/autocomplete')
7 | def autocomplete():
8 |     """Generate autocompletions given the query string 'q'."""
9 |     q = request.args.get('q')
10 |     completions = my_autocompleter.generate_completions(q, data_clean, model, tfidf_matrice)
11 |     return jsonify({"Completions": completions})
12 | 
13 | if __name__ == "__main__":
14 |     # build the model once at startup, then serve requests
15 |     my_autocompleter = autocompleter.Autocompleter()
16 |     data_orig = my_autocompleter.import_json("sample_conversations.json")
17 |     data_clean = my_autocompleter.process_data(data_orig)
18 |     model, tfidf_matrice = my_autocompleter.calc_matrice(data_clean)
19 |     print("ready to run...")
20 |     app.run(host="0.0.0.0", port=80)
--------------------------------------------------------------------------------
/autocompleter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import pandas as pd
4 | from pandas.io.json import json_normalize  # deprecated in newer pandas: use pandas.json_normalize there
5 | import re
6 | 
7 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
8 | from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
9 | from sklearn.metrics.pairwise import pairwise_distances
10 | 
11 | 
12 | DATA_DIR = './'
13 | 
14 | def load_df(json_path='name.json'):
15 |     """
16 |     source: adapted from a Kaggle Google Store competition kernel
17 |     """
18 |     df = pd.read_json(DATA_DIR+json_path)
19 | 
20 |     for column in ['Issues']:
21 |         column_as_df = json_normalize(df[column])
22 |         column_as_df.columns = [str(column+"_"+subcolumn) for subcolumn in column_as_df.columns]
23 |         df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
24 | 
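25 |     # keep the conversation index: after json_normalize, each conversation row
26 |     # holds a list of messages; the flattening below creates one row per message
27 |     # and stores the id of the conversation it came from in an 'index' column.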
28 |     df = pd.DataFrame([dict(y, index=i) for i, x in enumerate(df['Issues_Messages'].values.tolist()) for y in x])
29 | 
30 |     print(df.shape)
31 |     return df
32 | 
33 | 
34 | def splitDataFrameList(df, target_column, separator):
35 | 
36 |     '''
37 |     source: https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation
38 |     df = dataframe to split,
39 |     target_column = the column containing the values to split
40 |     separator = the symbol used to perform the split
41 |     returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
42 |     The values in the other columns are duplicated across the newly divided rows.
43 |     '''
44 |     def split_text(line, separator):  # note: currently unused helper
45 |         return [e + separator for e in line.split(separator) if e]
46 | 
47 |     def splitListToRows(row, row_accumulator, target_column, separator):
48 |         split_row = row[target_column].split(separator)
49 |         for s in split_row:
50 |             new_row = row.to_dict()
51 |             new_row[target_column] = s
52 |             row_accumulator.append(new_row)
53 | 
54 |     new_rows = []
55 |     df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
56 |     new_df = pd.DataFrame(new_rows)
57 |     return new_df
58 | 
59 | 
60 | 
61 | 
62 | class Autocompleter:
63 |     def __init__(self):
64 |         pass
65 | 
66 |     def import_json(self, json_filename):
67 |         print("load json file...")
68 |         df = load_df(json_filename)
69 |         return df
70 | 
71 |     def process_data(self, new_df):
72 | 
73 |         print("select representative threads...")
74 |         new_df = new_df[new_df.IsFromCustomer == False]
75 | 
76 |         print("split sentences on punctuation...")
77 |         for sep in ['. ', ', ', '? ', '! ', '; ']:
78 |             new_df = splitDataFrameList(new_df, 'Text', sep)
79 | 
80 |         print("Text Cleaning using simple regex...")
81 |         new_df['Text'] = new_df['Text'].apply(lambda x: " ".join(x.split()))
82 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.strip("."))
83 |         new_df['Text'] = new_df['Text'].apply(lambda x: " ".join(x.split()))
84 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' i ', ' I '))
85 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' ?', '?'))
86 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' !', '!'))
87 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' .', '.'))
88 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace('OK', 'Ok'))
89 |         new_df['Text'] = new_df['Text'].apply(lambda x: x[0].upper() + x[1:])
90 |         new_df['Text'] = new_df['Text'].apply(lambda x: x + "?" if re.search(r'^(Wh|How).+([^?])$', x) else x)
91 | 
92 |         print("calculate nb words of sentences...")
93 |         new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
94 |         new_df = new_df[new_df['nb_words'] > 2]
95 | 
96 |         print("count occurrence of sentences...")
97 |         new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')
98 | 
99 |         print("remove duplicates (keep last)...")
100 |         new_df = new_df.drop_duplicates(subset=['Text'], keep='last')
101 | 
102 |         new_df = new_df.reset_index(drop=True)
103 |         print(new_df.shape)
104 | 
105 |         return new_df
106 | 
107 |     def calc_matrice(self, df):
108 |         # define the tf-idf parameters used to vectorize and normalize the sentences;
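109 |         # word n-grams up to length 5 let the model match multi-word prefixes
110 |         # such as "what is your" rather than only single tokens, and min_df=0
111 |         # keeps even sentences whose terms appear a single time.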
112 |         model_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0)
113 |         tfidf_matrice = model_tf.fit_transform(df['Text'])
114 |         print("tfidf_matrice ", tfidf_matrice.shape)
115 |         return model_tf, tfidf_matrice
116 | 
117 |     def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):
118 | 
119 |         prefix_string = str(prefix_string)
120 |         new_df = data.reset_index(drop=True)
121 |         # dampen the raw counts so that very frequent sentences do not dominate
122 |         weights = new_df['Counts'].apply(lambda x: 1 + np.log1p(x)).values
123 | 
124 |         # transform the prefix string using the fitted tf-idf model
125 |         tfidf_matrice_spelling = model_tf.transform([prefix_string])
126 |         # cosine similarity between the prefix and every known sentence
127 |         # (linear_kernel suffices because tf-idf vectors are L2-normalized)
128 |         cosine_similarite = linear_kernel(tfidf_matrice, tfidf_matrice_spelling)
129 | 
130 |         # sort by similarity, most similar first, and keep the 10 best candidates
131 |         similarity_scores = list(enumerate(cosine_similarite))
132 |         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
133 |         similarity_scores = similarity_scores[0:10]
134 |         similarity_indices = [i[0] for i in similarity_scores]
135 | 
136 |         # weight the candidates by how frequently they occurred in the original data
137 |         for i in range(len(similarity_scores)):
138 |             similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices[i]]
139 | 
140 |         # re-rank with the weights applied and return the top 3 sentences
141 |         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
142 |         similarity_scores = similarity_scores[0:3]
143 |         similarity_indices_w = [i[0] for i in similarity_scores]
144 | 
145 |         return new_df.loc[similarity_indices_w]['Text'].tolist()
146 | 
--------------------------------------------------------------------------------
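For reference, once `autocomplete_server.py` is running, its endpoint can be exercised as in the following hypothetical session (the completions shown are taken from the notebook examples; real output depends on the data):

```python
import requests

# assumes the Flask server above is listening locally on port 80
resp = requests.get("http://localhost/autocomplete", params={"q": "What is your"})
print(resp.json())
# e.g. {"Completions": ["What is your account number?",
#                       "What is your order number?",
#                       "What is your phone number?"]}
```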