├── Autocomplete.ipynb
├── README.md
├── autocomplete_server.py
├── autocompleter.py
└── sample_conversations.json

/Autocomplete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": []
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": null,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": []
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "# Loading"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 1,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "import autocompleter\n",
31 |     "autocompl = autocompleter.Autocompleter()"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 2,
37 |    "metadata": {},
38 |    "outputs": [
39 |     {
40 |      "name": "stdout",
41 |      "output_type": "stream",
42 |      "text": [
43 |       "load json file...\n",
44 |       "(22264, 3)\n"
45 |      ]
46 |     },
47 |     {
48 |      "data": {
49 |       "text/plain": [
50 |        "((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))"
51 |       ]
52 |      },
53 |      "execution_count": 2,
54 |      "metadata": {},
55 |      "output_type": "execute_result"
56 |     }
57 |    ],
58 |    "source": [
59 |     "df = autocompl.import_json(\"sample_conversations.json\")\n",
60 |     "df.shape, df.columns"
61 |    ]
62 |   },
63 |   {
64 |    "cell_type": "markdown",
65 |    "metadata": {},
66 |    "source": [
67 |     "The file contains 22K messages from conversations between customers and support representatives.\n",
68 |     "For the purposes of this project, we are only interested in completing the representatives' messages."
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 4,
74 |    "metadata": {},
75 |    "outputs": [
76 |     {
77 |      "data": {
78 |       "text/html": [
\n", 80 | "\n", 93 | "\n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | "
IsFromCustomerTextindex
0TrueHi! I placed an order on your website and I ca...0
1TrueI think I used my email address to log in.0
2TrueMy battery exploded!1
3TrueIt's on fire, it's melting the carpet!1
4TrueWhat should I do!1
\n", 135 | "
" 136 | ], 137 | "text/plain": [ 138 | " IsFromCustomer Text index\n", 139 | "0 True Hi! I placed an order on your website and I ca... 0\n", 140 | "1 True I think I used my email address to log in. 0\n", 141 | "2 True My battery exploded! 1\n", 142 | "3 True It's on fire, it's melting the carpet! 1\n", 143 | "4 True What should I do! 1" 144 | ] 145 | }, 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "df.head()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Data Selection and Cleaning" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "The data is going to separate the threads from the customer and the representative, separate the sentenses based on the punctuation (we will keep the punctuation), the final text will be cleaned up with some light regex and only the sentense larger than 1 word will be kept." 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Finally, since the representative has the tendency to ask the same question over and over again, the autocomplete is extremely useful by suggesting a complete sentense. In our case, we will count the number of occurence of the same sentense so we can use it as a feature later on and delete the duplicates." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "select representative threads...\n", 186 | "split sentenses on punctuation...\n", 187 | "Text Cleaning using simple regex...\n", 188 | "calculate nb words of sentenses...\n", 189 | "count occurence of sentenses...\n", 190 | "remove duplicates (keep last)...\n", 191 | "(8599, 5)\n" 192 | ] 193 | }, 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "((8599, 5),\n", 198 | " Index(['IsFromCustomer', 'Text', 'index', 'nb_words', 'Counts'], dtype='object'))" 199 | ] 200 | }, 201 | "execution_count": 5, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "new_df = autocompl.process_data(df)\n", 208 | "new_df.shape, new_df.columns" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "# Model and TFIDF matrix" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "A matrice of similarity is calculated based on the frequency of all the words in the data using tfidfvectorizer" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 6, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "tfidf_matrice (8599, 99395)\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# Ranking Function" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Finally, the autocomplete is calculating the similarity between the sentense in the data and the prefix of the sentense written by the representative. As a weight feature, we chose to reorder using the frequency of the most common similar sentense." 
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "Examples of autocompletions"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 28,
266 |    "metadata": {},
267 |    "outputs": [
268 |     {
269 |      "name": "stdout",
270 |      "output_type": "stream",
271 |      "text": [
272 |       "What is your  \n",
273 |       " \n"
274 |      ]
275 |     },
276 |     {
277 |      "data": {
278 |       "text/plain": [
279 |        "['What is your account number?',\n",
280 |        " 'What is your order number?',\n",
281 |        " 'What is your phone number?']"
282 |       ]
283 |      },
284 |      "execution_count": 28,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "prefix = 'What is your'\n",
291 |     "\n",
292 |     "print(prefix,\" \\n \")\n",
293 |     "\n",
294 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 22,
300 |    "metadata": {},
301 |    "outputs": [
302 |     {
303 |      "name": "stdout",
304 |      "output_type": "stream",
305 |      "text": [
306 |       "How can  \n"
307 |      ]
308 |     },
309 |     {
310 |      "data": {
311 |       "text/plain": [
312 |        "['How can I help you?',\n",
313 |        " 'How can I help you today?',\n",
314 |        " 'Ok lets see how I can help']"
315 |       ]
316 |      },
317 |      "execution_count": 22,
318 |      "metadata": {},
319 |      "output_type": "execute_result"
320 |     }
321 |    ],
322 |    "source": [
323 |     "prefix = 'How can'\n",
324 |     "print(prefix,\" \")\n",
325 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 29,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stdout",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "Let me  \n"
338 |      ]
339 |     },
340 |     {
341 |      "data": {
342 |       "text/plain": [
343 |        "['Let me investigate', 'Let me assist you', 'Let me look']"
344 |       ]
345 |      },
346 |      "execution_count": 29,
347 |      "metadata": {},
348 |      "output_type": "execute_result"
349 |     }
350 |    ],
351 |    "source": [
352 |     "prefix = 'Let me'\n",
353 |     "print(prefix,\" \")\n",
354 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": 31,
360 |    "metadata": {},
361 |    "outputs": [
362 |     {
363 |      "name": "stdout",
364 |      "output_type": "stream",
365 |      "text": [
366 |       "when was  \n"
367 |      ]
368 |     },
369 |     {
370 |      "data": {
371 |       "text/plain": [
372 |        "['When was the last time you changed your password?',\n",
373 |        " 'When was your flight scheduled for?',\n",
374 |        " 'When was the last time you tried?']"
375 |       ]
376 |      },
377 |      "execution_count": 31,
378 |      "metadata": {},
379 |      "output_type": "execute_result"
380 |     }
381 |    ],
382 |    "source": [
383 |     "prefix = 'when was'\n",
384 |     "print(prefix,\" \")\n",
385 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "markdown",
390 |    "metadata": {},
391 |    "source": [
392 |     "Now, with no uppercase and only the most important keywords..."
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 35,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "name": "stdout",
402 |      "output_type": "stream",
403 |      "text": [
404 |       "when time password  \n"
405 |      ]
406 |     },
407 |     {
408 |      "data": {
409 |       "text/plain": [
410 |        "['When was the last time you changed your password?',\n",
411 |        " 'When you select you password?',\n",
412 |        " 'Take your time']"
413 |       ]
414 |      },
415 |      "execution_count": 35,
416 |      "metadata": {},
417 |      "output_type": "execute_result"
418 |     }
419 |    ],
420 |    "source": [
421 |     "prefix = 'when time password'\n",
422 |     "print(prefix,\" \")\n",
423 |     "autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": []
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "# Online Sources for this Project"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 34,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "# https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation\n",
447 |     "# Kaggle Google Store competition, for the JSON read\n",
448 |     "# https://www.kaggle.com/hamishdickson/weighted-word-autocomplete-using-star-wars-dataset"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": null,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": []
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "metadata": {},
462 |    "outputs": [],
463 |    "source": []
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "#"
472 |    ]
473 |   }
474 |  ],
475 |  "metadata": {
476 |   "kernelspec": {
477 |    "display_name": "Python 3",
478 |    "language": "python",
479 |    "name": "python3"
480 |   },
481 |   "language_info": {
482 |    "codemirror_mode": {
483 |     "name": "ipython",
484 |     "version": 3
485 |    },
486 |    "file_extension": ".py",
487 |    "mimetype": "text/x-python",
488 |    "name": "python",
489 |    "nbconvert_exporter": "python",
490 |    "pygments_lexer": "ipython3",
491 |    "version": "3.6.3"
492 |   }
493 |  },
494 |  "nbformat": 4,
495 |  "nbformat_minor": 2
496 | }
497 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autocomplete-NLP
2 | Sentence-level NLP autocomplete
3 | 
4 | **Imagine** that you are a representative replying to customers online, asking more or less the same questions over and over. Wouldn't you like to get automatic suggestions instead of typing the same thing again and again?
5 | 
6 | An **autocomplete** is helpful, fast, and convenient, and it can correct grammatical and spelling errors at the same time.
7 | 
8 | 
9 | **_Project_**:
10 | 
11 | In this project's Jupyter notebook, we select a history of sentences written by the representatives, format and correct them using a few regex rules, and count them so we can estimate their frequency and their likelihood of being useful again.
12 | After computing a similarity matrix with the scikit-learn **tfidf** vectorizer (word frequency and normalization), we use this matrix to calculate the similarity between the first few words typed by the representative and the history of past messages.
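13 | 
14 | As a rough illustration, the core of the ranking boils down to the following minimal, self-contained sketch (this is not the project's exact code, and the toy `history` list below is made up):
15 | 
16 | ```python
17 | from sklearn.feature_extraction.text import TfidfVectorizer
18 | from sklearn.metrics.pairwise import linear_kernel
19 | 
20 | # toy history of representative sentences
21 | history = ["What is your account number?",
22 |            "What is your order number?",
23 |            "Let me investigate"]
24 | 
25 | # fit TF-IDF on the history, then score every sentence against a prefix
26 | vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5))
27 | tfidf = vectorizer.fit_transform(history)
28 | scores = linear_kernel(tfidf, vectorizer.transform(["What is your"])).ravel()
29 | 
30 | # candidate completions, most similar first
31 | print([history[i] for i in scores.argsort()[::-1]])
32 | ```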
33 | The autocomplete then recognizes the closest sentences and ranks the 3 best proposals:
34 | 
35 | If you were to type: `What is your`,
36 | the tool would suggest:
37 | > What is your account number?,
38 | What is your order number?,
39 | What is your phone number?
40 | 
41 | 
42 | If you were to type: `Let me`,
43 | the tool would suggest:
44 | > Let me investigate, Let me assist you, Let me look
45 | 
46 | If you were to type, without any uppercase: `when was`,
47 | > When was the last time you changed your password?,
48 | When was your flight scheduled for?,
49 | When was the last time you tried?
50 | 
51 | 
52 | 
53 | 
54 | **_Improvements_**:
55 | 1. Clean up the "Mr. Smith" and "Ms. Smith" artifacts in the dataset.
56 | 2. Match letters to words (spelling match), then match the words against the history of representative sentences.
57 | 3. Build an evaluation of the results:
58 |    - a. offline: using unseen conversations between representatives and customers, feed the representative's prefix to the model and check whether the sentence the representative actually wrote is among the 3 ranked proposals.
59 |    - b. online: count the number of times the representative actually selects one of the proposals, and the number of times they decide to ignore them.
60 | 4. Improve the system by first matching the customer's sentences to a topic/context id, in order to better predict the representative's answers.
61 | 
62 | 
63 | 
64 | 
--------------------------------------------------------------------------------
/autocomplete_server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify
2 | import autocompleter
3 | 
4 | app = Flask(__name__)
5 | 
6 | @app.route('/autocomplete')
7 | def autocomplete():
8 |     """Generate autocompletions given the query string 'q'."""
9 |     q = request.args.get('q')
10 |     completions = my_autocompleter.generate_completions(q, data_clean, model, tfidf_matrice)
11 |     return jsonify({"Completions": completions})
12 | 
13 | if __name__ == "__main__":
14 |     # build the model once at startup, then serve requests
15 |     my_autocompleter = autocompleter.Autocompleter()
16 |     data_orig = my_autocompleter.import_json("sample_conversations.json")
17 |     data_clean = my_autocompleter.process_data(data_orig)
18 |     model, tfidf_matrice = my_autocompleter.calc_matrice(data_clean)
19 |     print("ready to run...")
20 |     app.run(host="0.0.0.0", port=80)
--------------------------------------------------------------------------------
/autocompleter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import pandas as pd
4 | from pandas.io.json import json_normalize  # deprecated in newer pandas: use pandas.json_normalize there
5 | import re
6 | 
7 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
8 | from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
9 | from sklearn.metrics.pairwise import pairwise_distances
10 | 
11 | 
12 | DATA_DIR = './'
13 | 
14 | def load_df(json_path='name.json'):
15 |     """
16 |     source: adapted from a Kaggle Google Store competition kernel
17 |     """
18 |     df = pd.read_json(DATA_DIR+json_path)
19 | 
20 |     for column in ['Issues']:
21 |         column_as_df = json_normalize(df[column])
22 |         column_as_df.columns = [str(column+"_"+subcolumn) for subcolumn in column_as_df.columns]
23 |         df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
24 | 
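25 |     # keep the conversation index: after json_normalize, each conversation row
26 |     # holds a list of messages; the flattening below creates one row per message
27 |     # and stores the id of the conversation it came from in an 'index' column.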
28 |     df = pd.DataFrame([dict(y, index=i) for i, x in enumerate(df['Issues_Messages'].values.tolist()) for y in x])
29 | 
30 |     print(df.shape)
31 |     return df
32 | 
33 | 
34 | def splitDataFrameList(df, target_column, separator):
35 | 
36 |     '''
37 |     source: https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation
38 |     df = dataframe to split,
39 |     target_column = the column containing the values to split
40 |     separator = the symbol used to perform the split
41 |     returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
42 |     The values in the other columns are duplicated across the newly divided rows.
43 |     '''
44 |     def split_text(line, separator):  # note: currently unused helper
45 |         return [e + separator for e in line.split(separator) if e]
46 | 
47 |     def splitListToRows(row, row_accumulator, target_column, separator):
48 |         split_row = row[target_column].split(separator)
49 |         for s in split_row:
50 |             new_row = row.to_dict()
51 |             new_row[target_column] = s
52 |             row_accumulator.append(new_row)
53 | 
54 |     new_rows = []
55 |     df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
56 |     new_df = pd.DataFrame(new_rows)
57 |     return new_df
58 | 
59 | 
60 | 
61 | 
62 | class Autocompleter:
63 |     def __init__(self):
64 |         pass
65 | 
66 |     def import_json(self, json_filename):
67 |         print("load json file...")
68 |         df = load_df(json_filename)
69 |         return df
70 | 
71 |     def process_data(self, new_df):
72 | 
73 |         print("select representative threads...")
74 |         new_df = new_df[new_df.IsFromCustomer == False]
75 | 
76 |         print("split sentences on punctuation...")
77 |         for sep in ['. ', ', ', '? ', '! ', '; ']:
78 |             new_df = splitDataFrameList(new_df, 'Text', sep)
79 | 
80 |         print("Text Cleaning using simple regex...")
81 |         new_df['Text'] = new_df['Text'].apply(lambda x: " ".join(x.split()))
82 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.strip("."))
83 |         new_df['Text'] = new_df['Text'].apply(lambda x: " ".join(x.split()))
84 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' i ', ' I '))
85 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' ?', '?'))
86 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' !', '!'))
87 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace(' .', '.'))
88 |         new_df['Text'] = new_df['Text'].apply(lambda x: x.replace('OK', 'Ok'))
89 |         new_df['Text'] = new_df['Text'].apply(lambda x: x[0].upper() + x[1:])
90 |         new_df['Text'] = new_df['Text'].apply(lambda x: x + "?" if re.search(r'^(Wh|How).+([^?])$', x) else x)
91 | 
92 |         print("calculate nb words of sentences...")
93 |         new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
94 |         new_df = new_df[new_df['nb_words'] > 2]
95 | 
96 |         print("count occurrence of sentences...")
97 |         new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')
98 | 
99 |         print("remove duplicates (keep last)...")
100 |         new_df = new_df.drop_duplicates(subset=['Text'], keep='last')
101 | 
102 |         new_df = new_df.reset_index(drop=True)
103 |         print(new_df.shape)
104 | 
105 |         return new_df
106 | 
107 |     def calc_matrice(self, df):
108 |         # define the tf-idf parameters used to vectorize and normalize the sentences;
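109 |         # word n-grams up to length 5 let the model match multi-word prefixes
110 |         # such as "what is your" rather than only single tokens, and min_df=0
111 |         # keeps even sentences whose terms appear a single time.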
112 |         model_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0)
113 |         tfidf_matrice = model_tf.fit_transform(df['Text'])
114 |         print("tfidf_matrice ", tfidf_matrice.shape)
115 |         return model_tf, tfidf_matrice
116 | 
117 |     def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):
118 | 
119 |         prefix_string = str(prefix_string)
120 |         new_df = data.reset_index(drop=True)
121 |         # dampen the raw counts so that very frequent sentences do not dominate
122 |         weights = new_df['Counts'].apply(lambda x: 1 + np.log1p(x)).values
123 | 
124 |         # transform the prefix string using the fitted tf-idf model
125 |         tfidf_matrice_spelling = model_tf.transform([prefix_string])
126 |         # cosine similarity between the prefix and every known sentence
127 |         # (linear_kernel suffices because tf-idf vectors are L2-normalized)
128 |         cosine_similarite = linear_kernel(tfidf_matrice, tfidf_matrice_spelling)
129 | 
130 |         # sort by similarity, most similar first, and keep the 10 best candidates
131 |         similarity_scores = list(enumerate(cosine_similarite))
132 |         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
133 |         similarity_scores = similarity_scores[0:10]
134 |         similarity_indices = [i[0] for i in similarity_scores]
135 | 
136 |         # weight the candidates by how frequently they occurred in the original data
137 |         for i in range(len(similarity_scores)):
138 |             similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices[i]]
139 | 
140 |         # re-rank with the weights applied and return the top 3 sentences
141 |         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
142 |         similarity_scores = similarity_scores[0:3]
143 |         similarity_indices_w = [i[0] for i in similarity_scores]
144 | 
145 |         return new_df.loc[similarity_indices_w]['Text'].tolist()
146 | 
--------------------------------------------------------------------------------
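For reference, once `autocomplete_server.py` is running, its endpoint can be exercised as in the following hypothetical session (the completions shown are taken from the notebook examples; real output depends on the data):

```python
import requests

# assumes the Flask server above is listening locally on port 80
resp = requests.get("http://localhost/autocomplete", params={"q": "What is your"})
print(resp.json())
# e.g. {"Completions": ["What is your account number?",
#                       "What is your order number?",
#                       "What is your phone number?"]}
```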