├── Autocomplete.ipynb
├── README.md
├── autocomplete_server.py
├── autocompleter.py
└── sample_conversations.json
/Autocomplete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "# Loading"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import autocompleter \n",
31 | "autocompl = autocompleter.Autocompleter()"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "load json file...\n",
44 | "(22264, 3)\n"
45 | ]
46 | },
47 | {
48 | "data": {
49 | "text/plain": [
50 | "((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))"
51 | ]
52 | },
53 | "execution_count": 2,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "df = autocompl.import_json(\"sample_conversations.json\")\n",
60 | "df.shape, df.columns"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "The file contains 22K conversations between a customer and a representative.\n",
68 | "For the purpose of this project, we are only interested in completing the threads of the representative."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/html": [
79 | "
\n",
80 | "\n",
93 | "
\n",
94 | " \n",
95 | " \n",
96 | " | \n",
97 | " IsFromCustomer | \n",
98 | " Text | \n",
99 | " index | \n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " 0 | \n",
105 | " True | \n",
106 | " Hi! I placed an order on your website and I ca... | \n",
107 | " 0 | \n",
108 | "
\n",
109 | " \n",
110 | " 1 | \n",
111 | " True | \n",
112 | " I think I used my email address to log in. | \n",
113 | " 0 | \n",
114 | "
\n",
115 | " \n",
116 | " 2 | \n",
117 | " True | \n",
118 | " My battery exploded! | \n",
119 | " 1 | \n",
120 | "
\n",
121 | " \n",
122 | " 3 | \n",
123 | " True | \n",
124 | " It's on fire, it's melting the carpet! | \n",
125 | " 1 | \n",
126 | "
\n",
127 | " \n",
128 | " 4 | \n",
129 | " True | \n",
130 | " What should I do! | \n",
131 | " 1 | \n",
132 | "
\n",
133 | " \n",
134 | "
\n",
135 | "
"
136 | ],
137 | "text/plain": [
138 | " IsFromCustomer Text index\n",
139 | "0 True Hi! I placed an order on your website and I ca... 0\n",
140 | "1 True I think I used my email address to log in. 0\n",
141 | "2 True My battery exploded! 1\n",
142 | "3 True It's on fire, it's melting the carpet! 1\n",
143 | "4 True What should I do! 1"
144 | ]
145 | },
146 | "execution_count": 4,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "df.head()"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "# Data Selection and Cleaning"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "The data is going to separate the threads from the customer and the representative, separate the sentenses based on the punctuation (we will keep the punctuation), the final text will be cleaned up with some light regex and only the sentense larger than 1 word will be kept."
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "Finally, since the representative has the tendency to ask the same question over and over again, the autocomplete is extremely useful by suggesting a complete sentense. In our case, we will count the number of occurence of the same sentense so we can use it as a feature later on and delete the duplicates."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 5,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "select representative threads...\n",
186 | "split sentenses on punctuation...\n",
187 | "Text Cleaning using simple regex...\n",
188 | "calculate nb words of sentenses...\n",
189 | "count occurence of sentenses...\n",
190 | "remove duplicates (keep last)...\n",
191 | "(8599, 5)\n"
192 | ]
193 | },
194 | {
195 | "data": {
196 | "text/plain": [
197 | "((8599, 5),\n",
198 | " Index(['IsFromCustomer', 'Text', 'index', 'nb_words', 'Counts'], dtype='object'))"
199 | ]
200 | },
201 | "execution_count": 5,
202 | "metadata": {},
203 | "output_type": "execute_result"
204 | }
205 | ],
206 | "source": [
207 | "new_df = autocompl.process_data(df)\n",
208 | "new_df.shape, new_df.columns"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "# Model and TFIDF matrix"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "A matrice of similarity is calculated based on the frequency of all the words in the data using tfidfvectorizer"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 6,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "tfidf_matrice (8599, 99395)\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "# Ranking Function"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "Finally, the autocomplete is calculating the similarity between the sentense in the data and the prefix of the sentense written by the representative. As a weight feature, we chose to reorder using the frequency of the most common similar sentense."
254 | ]
255 | },
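{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of the ranking weight (it mirrors `generate_completions` in `autocompleter.py`): each candidate's cosine similarity is multiplied by `1 + log(1 + count)`, so frequent sentences are boosted, but only logarithmically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"# weight applied to each candidate score before re-ranking:\n",
"# cosine_similarity * (1 + log1p(occurrence count))\n",
"for count in [1, 5, 50]:\n",
"    print(count, round(1 + np.log1p(count), 3))"
]
},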
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "examples of auto completions"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 28,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "What is your \n",
273 | " \n"
274 | ]
275 | },
276 | {
277 | "data": {
278 | "text/plain": [
279 | "['What is your account number?',\n",
280 | " 'What is your order number?',\n",
281 | " 'What is your phone number?']"
282 | ]
283 | },
284 | "execution_count": 28,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "prefix = 'What is your'\n",
291 | "\n",
292 | "print(prefix,\" \\n \")\n",
293 | "\n",
294 | "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 22,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "How can \n"
307 | ]
308 | },
309 | {
310 | "data": {
311 | "text/plain": [
312 | "['How can I help you?',\n",
313 | " 'How can I help you today?',\n",
314 | " 'Ok lets see how I can help']"
315 | ]
316 | },
317 | "execution_count": 22,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "prefix = 'How can'\n",
324 | "print(prefix,\" \")\n",
325 | "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 29,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "Let me \n"
338 | ]
339 | },
340 | {
341 | "data": {
342 | "text/plain": [
343 | "['Let me investigate', 'Let me assist you', 'Let me look']"
344 | ]
345 | },
346 | "execution_count": 29,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "prefix = 'Let me'\n",
353 | "print(prefix,\" \")\n",
354 | "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 31,
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "when was \n"
367 | ]
368 | },
369 | {
370 | "data": {
371 | "text/plain": [
372 | "['When was the last time you changed your password?',\n",
373 | " 'When was your flight scheduled for?',\n",
374 | " 'When was the last time you tried?']"
375 | ]
376 | },
377 | "execution_count": 31,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "prefix = 'when was'\n",
384 | "print(prefix,\" \")\n",
385 | "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "Now, without any uppercase and just with the important words..."
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 35,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "when time password \n"
405 | ]
406 | },
407 | {
408 | "data": {
409 | "text/plain": [
410 | "['When was the last time you changed your password?',\n",
411 | " 'When you select you password?',\n",
412 | " 'Take your time']"
413 | ]
414 | },
415 | "execution_count": 35,
416 | "metadata": {},
417 | "output_type": "execute_result"
418 | }
419 | ],
420 | "source": [
421 | "prefix = 'when time password'\n",
422 | "print(prefix,\" \")\n",
423 | "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": []
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "# Online Sources for this project"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 34,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": [
446 | "# https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation\n",
447 | "# kaggle google store competition for json read\n",
448 | "# https://www.kaggle.com/hamishdickson/weighted-word-autocomplete-using-star-wars-dataset"
449 | ]
450 | }
474 | ],
475 | "metadata": {
476 | "kernelspec": {
477 | "display_name": "Python 3",
478 | "language": "python",
479 | "name": "python3"
480 | },
481 | "language_info": {
482 | "codemirror_mode": {
483 | "name": "ipython",
484 | "version": 3
485 | },
486 | "file_extension": ".py",
487 | "mimetype": "text/x-python",
488 | "name": "python",
489 | "nbconvert_exporter": "python",
490 | "pygments_lexer": "ipython3",
491 | "version": "3.6.3"
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 2
496 | }
497 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autocomplete-NLP
2 | Sentence-level NLP autocomplete
3 |
4 | **Imagine** that you are a representative replying to customers online, asking more or less the same questions over and over. Would you like to get automatic suggestions instead of typing the same thing again and again?
5 |
6 | An **autocomplete** can be helpful, fast, and convenient, and it can also correct grammatical and spelling errors at the same time.
7 |
8 |
9 | **_Project_**:
10 |
11 | In the Jupyter notebook of this project, we select a history of sentences written by the representatives and the customers, format and correct them using a few regex rules, and count them so we can estimate their frequency and how likely they are to be useful again.
12 | After calculating a similarity matrix based on scikit-learn's **tfidf** tool (word frequency and normalization), we use this matrix to compute the similarity between the first few words typed by the representative and the history of past messages.
13 | The autocomplete recognizes the closest sentences and ranks 3 final proposals (see the quick start at the end of this README):
14 |
15 | If you were to type: `What is your`,
16 | the tool would suggest:
17 | > What is your account number?,
18 | What is your order number?,
19 | What is your phone number?
20 |
21 |
22 | If you were to type: `Let me`,
23 | the tool would suggest:
24 | > Let me investigate, Let me assist you, Let me look
25 |
26 | If you were to type, all lowercase: `when was`,
27 | > When was the last time you changed your password?,
28 | When was your flight scheduled for?,
29 | When was the last time you tried?
30 |
31 |
32 |
33 |
34 | **_Improvements_**:
35 | 1. Clean up the "Mr. Smith" and "Ms. Smith" entries in the dataset.
36 | 2. Match the letters to words (spelling match) and then match the words to the history of representative sentences.
37 | 3. Build an evaluation of the results:
38 |    - a. offline: using unseen conversations between representatives and customers, feed the representative's prefix to the model and check whether the actual representative sentence is among the 3 ranked proposals.
39 |    - b. online: count the number of times the representative actually selects a proposal, and the number of times they decide to ignore the suggestions.
40 | 4. Improve the system by first matching the customer's sentences to a topic id (context) in order to better predict the representative's answers.
41 |
42 |
43 |
44 |
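**_Quick start_**:

A minimal sketch of the module's API, as exercised in the notebook (it assumes `sample_conversations.json` is in the working directory):

```python
import autocompleter

completer = autocompleter.Autocompleter()
data_orig = completer.import_json("sample_conversations.json")
data_clean = completer.process_data(data_orig)
model, tfidf_matrice = completer.calc_matrice(data_clean)

# prints the 3 highest-ranked completions for the prefix
print(completer.generate_completions("What is your", data_clean, model, tfidf_matrice))
```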
--------------------------------------------------------------------------------
/autocomplete_server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify
2 | import autocompleter
3 |
4 | app = Flask(__name__)
5 | 
6 | @app.route('/autocomplete')
7 | def autocomplete():
8 |
9 | """ Generate autocompletions given the query 'q' """
10 |
11 |     q = request.args.get('q', '')
12 |     completions = my_autocompleter.generate_completions(q, data_clean, model, tfidf_matrice)
13 | return jsonify({"Completions": completions})
14 |
15 | if __name__ == "__main__":
16 |
17 | my_autocompleter = autocompleter.Autocompleter()
18 | data_orig = my_autocompleter.import_json("sample_conversations.json")
19 | data_clean = my_autocompleter.process_data(data_orig)
20 |     model, tfidf_matrice = my_autocompleter.calc_matrice(data_clean)
21 | print("ready to run...")
22 |
23 | app.run(host="0.0.0.0", port=80)
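
# Example request once the server is running (note: binding to port 80 may
# require elevated privileges; adjust the port if needed):
#   curl "http://localhost/autocomplete?q=What+is+your"
# which should return JSON along the lines of:
#   {"Completions": ["What is your account number?", "What is your order number?", "What is your phone number?"]}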
--------------------------------------------------------------------------------
/autocompleter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
4 | import numpy as np
5 | import pandas as pd
6 | from pandas.io.json import json_normalize
7 | import re
8 |
9 | from sklearn.feature_extraction.text import TfidfVectorizer
10 | from sklearn.metrics.pairwise import linear_kernel
12 |
13 |
14 | DATA_DIR = './'
15 |
16 | def load_df(json_path='name.json'):
17 | """
18 |     source: adapted from the Kaggle GStore competition (JSON flattening)
19 | """
20 | df = pd.read_json(DATA_DIR+json_path)
21 |
22 | for column in ['Issues']:
23 | column_as_df = json_normalize(df[column])
24 | column_as_df.columns = [str(column+"_"+subcolumn) for subcolumn in column_as_df.columns]
25 | df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
26 |
27 |     # keep the conversation index so we can merge back onto the original data if needed
28 | df = pd.DataFrame([dict(y, index=i) for i, x in enumerate(df['Issues_Messages'].values.tolist()) for y in x])
29 |
30 | print(df.shape)
31 | return df
32 |
33 |
34 | def splitDataFrameList(df,target_column,separator):
35 |
36 | '''
37 | source: https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation
38 | df = dataframe to split,
39 | target_column = the column containing the values to split
40 | separator = the symbol used to perform the split
41 | returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
42 | The values in the other columns are duplicated across the newly divided rows.
43 | '''
44 |     def split_text(line, separator):
45 |         # currently unused helper: re-append the separator's punctuation to each piece
46 |         return [e + separator.strip() for e in line.split(separator) if e]
47 |
48 | def splitListToRows(row,row_accumulator,target_column,separator):
49 | split_row = row[target_column].split(separator)
50 | for s in split_row:
51 | new_row = row.to_dict()
52 | new_row[target_column] = s
53 | row_accumulator.append(new_row)
54 | new_rows = []
55 | df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
56 | new_df = pd.DataFrame(new_rows)
57 | return new_df
58 |
59 |
60 |
61 |
62 | class Autocompleter:
63 | def __init__(self):
64 | pass
65 |
66 | def import_json(self, json_filename):
67 | print("load json file...")
68 | df = load_df(json_filename)
69 | return df
70 |
71 | def process_data(self, new_df):
72 |
73 | print("select representative threads...")
74 | new_df = new_df[new_df.IsFromCustomer==False]
75 |
76 | print("split sentenses on punctuation...")
77 | for sep in ['. ',', ','? ', '! ', '; ']:
78 | new_df = splitDataFrameList(new_df, 'Text', sep)
79 |
80 | print("Text Cleaning using simple regex...")
81 | new_df['Text']=new_df['Text'].apply(lambda x: " ".join(x.split()))
82 | new_df['Text']=new_df['Text'].apply(lambda x: x.strip("."))
83 | new_df['Text']=new_df['Text'].apply(lambda x: " ".join(x.split()))
84 | new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' i ',' I '))
85 | new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' ?','?'))
86 | new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' !','!'))
87 | new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' .','.'))
88 | new_df['Text']=new_df['Text'].apply(lambda x: x.replace('OK','Ok'))
89 | new_df['Text']=new_df['Text'].apply(lambda x: x[0].upper()+x[1:])
90 | new_df['Text']=new_df['Text'].apply(lambda x: x+"?" if re.search(r'^(Wh|How).+([^?])$',x) else x)
91 |
92 | print("calculate nb words of sentenses...")
93 | new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
94 | new_df = new_df[new_df['nb_words']>2]
95 |
96 | print("count occurence of sentenses...")
97 | new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')
98 |
99 | print("remove duplicates (keep last)...")
100 | new_df = new_df.drop_duplicates(subset=['Text'], keep='last')
101 |
102 | new_df = new_df.reset_index(drop=True)
103 | print(new_df.shape)
104 |
105 | return new_df
106 |
107 | def calc_matrice(self, df):
108 | # define tfidf parameter in order to count/vectorize the description vector and then normalize it.
109 | model_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 5), min_df=0)
110 | tfidf_matrice = model_tf.fit_transform(df['Text'])
111 | print("tfidf_matrice ", tfidf_matrice.shape)
112 | return model_tf, tfidf_matrice
113 |
114 | def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):
115 |
116 | prefix_string = str(prefix_string)
117 | new_df = data.reset_index(drop=True)
118 | weights = new_df['Counts'].apply(lambda x: 1+ np.log1p(x)).values
119 |
120 |         # transform the prefix string using the tfidf model
121 |         tfidf_matrice_spelling = model_tf.transform([prefix_string])
122 |         # cosine similarity between the prefix and every sentence in the data
123 |         cosine_similarite = linear_kernel(tfidf_matrice, tfidf_matrice_spelling)
124 |
125 |         # sort by similarity, from highest to lowest, and keep the top 10 candidates:
126 |         similarity_scores = list(enumerate(cosine_similarite))
127 |         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
128 |         similarity_scores = similarity_scores[0:10]
129 | 
130 |         similarity_indices = [i[0] for i in similarity_scores]
132 |
133 |         # boost candidates that occurred frequently in the original data
134 |         for i in range(len(similarity_scores)):
135 |             similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices[i]]
136 |
137 | similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
138 | similarity_scores = similarity_scores[0:3]
139 | similarity_indices_w = [i[0] for i in similarity_scores]
140 |
141 | return new_df.loc[similarity_indices_w]['Text'].tolist()
142 |
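if __name__ == "__main__":
    # Minimal smoke test of the full pipeline; it assumes sample_conversations.json
    # sits next to this file, as in the notebook.
    completer = Autocompleter()
    data_orig = completer.import_json("sample_conversations.json")
    data_clean = completer.process_data(data_orig)
    model_tf, tfidf_matrice = completer.calc_matrice(data_clean)
    print(completer.generate_completions("What is your", data_clean, model_tf, tfidf_matrice))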
--------------------------------------------------------------------------------