├── .gitignore
├── LICENSE
├── README.md
└── main.ipynb


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Mihail Salnikov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tf-idf_and_k-means
2 | Text clustering with K-means and tf-idf
3 | 
4 | Code for [Text clustering with K-means and tf-idf](https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183) blogpost.
5 | 


--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import re\n",
 10 |     "import string\n",
 11 |     "import pandas as pd\n",
 12 |     "from functools import reduce\n",
 13 |     "from math import log"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "## Simple example of [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)\n",
 21 |     "1. Example of corpus\n",
 22 |     "2. Preprocessing and Tokenizing\n",
 23 |     "3. Calculating bag of words\n",
 24 |     "4. TF\n",
 25 |     "5. IDF\n",
 26 |     "6. TF-IDF"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 2,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "#1\n",
 36 |     "corpus = \"\"\"\n",
 37 |     "Simple example with Cats and Mouse\n",
 38 |     "Another simple example with dogs and cats\n",
 39 |     "Another simple example with mouse and cheese\n",
 40 |     "\"\"\".split(\"\\n\")[1:-1]"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 3,
 46 |    "metadata": {},
 47 |    "outputs": [
 48 |     {
 49 |      "name": "stdout",
 50 |      "output_type": "stream",
 51 |      "text": [
 52 |       "['simple', 'example', 'with', 'cats', 'and', 'mouse']\n",
 53 |       "['another', 'simple', 'example', 'with', 'dogs', 'and', 'cats']\n",
 54 |       "['another', 'simple', 'example', 'with', 'mouse', 'and', 'cheese']\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "#2\n",
 60 |     "l_A = corpus[0].lower().split()\n",
 61 |     "l_B = corpus[1].lower().split()\n",
 62 |     "l_C = corpus[2].lower().split()\n",
 63 |     "\n",
 64 |     "print(l_A)\n",
 65 |     "print(l_B)\n",
 66 |     "print(l_C)"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 4,
 72 |    "metadata": {},
 73 |    "outputs": [
 74 |     {
 75 |      "name": "stdout",
 76 |      "output_type": "stream",
 77 |      "text": [
 78 |       "{'dogs', 'mouse', 'simple', 'and', 'cheese', 'cats', 'another', 'example', 'with'}\n"
 79 |      ]
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "#3\n",
 84 |     "word_set = set(l_A).union(set(l_B)).union(set(l_C))\n",
 85 |     "print(word_set)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 5,
 91 |    "metadata": {},
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/html": [
 96 |        "<div>\n",
 97 |        "<style scoped>\n",
 98 |        "    .dataframe tbody tr th:only-of-type {\n",
 99 |        "        vertical-align: middle;\n",
100 |        "    }\n",
101 |        "\n",
102 |        "    .dataframe tbody tr th {\n",
103 |        "        vertical-align: top;\n",
104 |        "    }\n",
105 |        "\n",
106 |        "    .dataframe thead th {\n",
107 |        "        text-align: right;\n",
108 |        "    }\n",
109 |        "</style>\n",
110 |        "<table border=\"1\" class=\"dataframe\">\n",
111 |        "  <thead>\n",
112 |        "    <tr style=\"text-align: right;\">\n",
113 |        "      <th></th>\n",
114 |        "      <th>and</th>\n",
115 |        "      <th>another</th>\n",
116 |        "      <th>cats</th>\n",
117 |        "      <th>cheese</th>\n",
118 |        "      <th>dogs</th>\n",
119 |        "      <th>example</th>\n",
120 |        "      <th>mouse</th>\n",
121 |        "      <th>simple</th>\n",
122 |        "      <th>with</th>\n",
123 |        "    </tr>\n",
124 |        "  </thead>\n",
125 |        "  <tbody>\n",
126 |        "    <tr>\n",
127 |        "      <th>0</th>\n",
128 |        "      <td>1</td>\n",
129 |        "      <td>0</td>\n",
130 |        "      <td>1</td>\n",
131 |        "      <td>0</td>\n",
132 |        "      <td>0</td>\n",
133 |        "      <td>1</td>\n",
134 |        "      <td>1</td>\n",
135 |        "      <td>1</td>\n",
136 |        "      <td>1</td>\n",
137 |        "    </tr>\n",
138 |        "    <tr>\n",
139 |        "      <th>1</th>\n",
140 |        "      <td>1</td>\n",
141 |        "      <td>1</td>\n",
142 |        "      <td>1</td>\n",
143 |        "      <td>0</td>\n",
144 |        "      <td>1</td>\n",
145 |        "      <td>1</td>\n",
146 |        "      <td>0</td>\n",
147 |        "      <td>1</td>\n",
148 |        "      <td>1</td>\n",
149 |        "    </tr>\n",
150 |        "    <tr>\n",
151 |        "      <th>2</th>\n",
152 |        "      <td>1</td>\n",
153 |        "      <td>1</td>\n",
154 |        "      <td>0</td>\n",
155 |        "      <td>1</td>\n",
156 |        "      <td>0</td>\n",
157 |        "      <td>1</td>\n",
158 |        "      <td>1</td>\n",
159 |        "      <td>1</td>\n",
160 |        "      <td>1</td>\n",
161 |        "    </tr>\n",
162 |        "  </tbody>\n",
163 |        "</table>\n",
164 |        "</div>"
165 |       ],
166 |       "text/plain": [
167 |        "   and  another  cats  cheese  dogs  example  mouse  simple  with\n",
168 |        "0    1        0     1       0     0        1      1       1     1\n",
169 |        "1    1        1     1       0     1        1      0       1     1\n",
170 |        "2    1        1     0       1     0        1      1       1     1"
171 |       ]
172 |      },
173 |      "execution_count": 5,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     }
177 |    ],
178 |    "source": [
179 |     "word_dict_A = dict.fromkeys(word_set, 0)\n",
180 |     "word_dict_B = dict.fromkeys(word_set, 0)\n",
181 |     "word_dict_C = dict.fromkeys(word_set, 0)\n",
182 |     "\n",
183 |     "for word in l_A:\n",
184 |     "    word_dict_A[word] += 1\n",
185 |     "\n",
186 |     "for word in l_B:\n",
187 |     "    word_dict_B[word] += 1\n",
188 |     "\n",
189 |     "for word in l_C:\n",
190 |     "    word_dict_C[word] += 1\n",
191 |     "\n",
192 |     "pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "## \\#4 tf - term frequency\n",
200 |     "In the case of the term frequency $tf(t,d)$, the simplest choice is to use the raw count of a term in a string. \n",
201 |     "$${\\displaystyle \\mathrm {tf} (t,d)={\\frac {n_{t}}{\\sum _{k}n_{k}}}} $$\n",
202 |     "where $n_t$ is the number of occurrences of the word $t$ in the string, and in the denominator - the total number of words in this string."
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 6,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "def compute_tf(word_dict, l):\n",
212 |     "    tf = {}\n",
213 |     "    sum_nk = len(l)\n",
214 |     "    for word, count in word_dict.items():\n",
215 |     "        tf[word] = count/sum_nk\n",
216 |     "    return tf"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 7,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "tf_A = compute_tf(word_dict_A, l_A)\n",
226 |     "tf_B = compute_tf(word_dict_B, l_B)\n",
227 |     "tf_C = compute_tf(word_dict_C, l_C)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "## \\#5 idf - inverse document frequency\n",
235 |     "idf is a measure of how much information the word provides\n",
236 |     "$$ \\mathrm{idf}(t, D) =  \\log \\frac{N}{|\\{d \\in D: t \\in d\\}|} $$\n",
237 |     "- $N$: total number of strings in the corpus ${\\displaystyle N={|D|}}$\n",
238 |     "- ${\\displaystyle |\\{d\\in D:t\\in d\\}|}$  : number of strings where the term ${\\displaystyle t}$ appears (i.e., ${\\displaystyle \\mathrm {tf} (t,d)\\neq 0})$. If the term is not in the corpus, this will lead to a division-by-zero. It is therefore common to adjust the denominator to ${\\displaystyle 1+|\\{d\\in D:t\\in d\\}|}$."
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 8,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "def compute_idf(strings_list):\n",
248 |     "    n = len(strings_list)\n",
249 |     "    idf = dict.fromkeys(strings_list[0].keys(), 0)\n",
250 |     "    for l in strings_list:\n",
251 |     "        for word, count in l.items():\n",
252 |     "            if count > 0:\n",
253 |     "                idf[word] += 1\n",
254 |     "    \n",
255 |     "    for word, v in idf.items():\n",
256 |     "        idf[word] = log(n / float(v))\n",
257 |     "    return idf"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 9,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "idf = compute_idf([word_dict_A, word_dict_B, word_dict_C])"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {},
272 |    "source": [
273 |     "## \\# 6 tf-idf\n",
274 |     "Then tf–idf is calculated as\n",
275 |     "$$ {\\displaystyle \\mathrm {tfidf} (t,d,D)=\\mathrm {tf} (t,d)\\cdot \\mathrm {idf} (t,D)} $$"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 10,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "def compute_tf_idf(tf, idf):\n",
285 |     "    tf_idf = dict.fromkeys(tf.keys(), 0)\n",
286 |     "    for word, v in tf.items():\n",
287 |     "        tf_idf[word] = v * idf[word]\n",
288 |     "    return tf_idf"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 11,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "tf_idf_A = compute_tf_idf(tf_A, idf)\n",
298 |     "tf_idf_B = compute_tf_idf(tf_B, idf)\n",
299 |     "tf_idf_C = compute_tf_idf(tf_C, idf)"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": 12,
305 |    "metadata": {},
306 |    "outputs": [
307 |     {
308 |      "data": {
309 |       "text/html": [
310 |        "<div>\n",
311 |        "<style scoped>\n",
312 |        "    .dataframe tbody tr th:only-of-type {\n",
313 |        "        vertical-align: middle;\n",
314 |        "    }\n",
315 |        "\n",
316 |        "    .dataframe tbody tr th {\n",
317 |        "        vertical-align: top;\n",
318 |        "    }\n",
319 |        "\n",
320 |        "    .dataframe thead th {\n",
321 |        "        text-align: right;\n",
322 |        "    }\n",
323 |        "</style>\n",
324 |        "<table border=\"1\" class=\"dataframe\">\n",
325 |        "  <thead>\n",
326 |        "    <tr style=\"text-align: right;\">\n",
327 |        "      <th></th>\n",
328 |        "      <th>and</th>\n",
329 |        "      <th>another</th>\n",
330 |        "      <th>cats</th>\n",
331 |        "      <th>cheese</th>\n",
332 |        "      <th>dogs</th>\n",
333 |        "      <th>example</th>\n",
334 |        "      <th>mouse</th>\n",
335 |        "      <th>simple</th>\n",
336 |        "      <th>with</th>\n",
337 |        "    </tr>\n",
338 |        "  </thead>\n",
339 |        "  <tbody>\n",
340 |        "    <tr>\n",
341 |        "      <th>0</th>\n",
342 |        "      <td>0.0</td>\n",
343 |        "      <td>0.000000</td>\n",
344 |        "      <td>0.067578</td>\n",
345 |        "      <td>0.000000</td>\n",
346 |        "      <td>0.000000</td>\n",
347 |        "      <td>0.0</td>\n",
348 |        "      <td>0.067578</td>\n",
349 |        "      <td>0.0</td>\n",
350 |        "      <td>0.0</td>\n",
351 |        "    </tr>\n",
352 |        "    <tr>\n",
353 |        "      <th>1</th>\n",
354 |        "      <td>0.0</td>\n",
355 |        "      <td>0.057924</td>\n",
356 |        "      <td>0.057924</td>\n",
357 |        "      <td>0.000000</td>\n",
358 |        "      <td>0.156945</td>\n",
359 |        "      <td>0.0</td>\n",
360 |        "      <td>0.000000</td>\n",
361 |        "      <td>0.0</td>\n",
362 |        "      <td>0.0</td>\n",
363 |        "    </tr>\n",
364 |        "    <tr>\n",
365 |        "      <th>2</th>\n",
366 |        "      <td>0.0</td>\n",
367 |        "      <td>0.057924</td>\n",
368 |        "      <td>0.000000</td>\n",
369 |        "      <td>0.156945</td>\n",
370 |        "      <td>0.000000</td>\n",
371 |        "      <td>0.0</td>\n",
372 |        "      <td>0.057924</td>\n",
373 |        "      <td>0.0</td>\n",
374 |        "      <td>0.0</td>\n",
375 |        "    </tr>\n",
376 |        "  </tbody>\n",
377 |        "</table>\n",
378 |        "</div>"
379 |       ],
380 |       "text/plain": [
381 |        "   and   another      cats    cheese      dogs  example     mouse  simple  \\\n",
382 |        "0  0.0  0.000000  0.067578  0.000000  0.000000      0.0  0.067578     0.0   \n",
383 |        "1  0.0  0.057924  0.057924  0.000000  0.156945      0.0  0.000000     0.0   \n",
384 |        "2  0.0  0.057924  0.000000  0.156945  0.000000      0.0  0.057924     0.0   \n",
385 |        "\n",
386 |        "   with  \n",
387 |        "0   0.0  \n",
388 |        "1   0.0  \n",
389 |        "2   0.0  "
390 |       ]
391 |      },
392 |      "execution_count": 12,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "pd.DataFrame([tf_idf_A, tf_idf_B, tf_idf_C])"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "markdown",
403 |    "metadata": {},
404 |    "source": [
405 |     "# For clustering we must use tf-idf weights\n",
406 |     "the example above is just an example, in practice it is better to apply [TfidfVectorizer from sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 13,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
416 |     "from sklearn.cluster import KMeans"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "## Full text for clusterring\n",
424 |     "\n",
425 |     "This corpus contain some strings about Google and some strings about TF-IDF from Wikipedia. Just for example"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 14,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "all_text = \"\"\"\n",
435 |     "Google and Facebook are strangling the free press to death. Democracy is the loser\n",
436 |     "Your 60-second guide to security stuff Google touted today at Next '18\n",
437 |     "A Guide to Using Android Without Selling Your Soul to Google\n",
438 |     "Review: Lenovo’s Google Smart Display is pretty and intelligent\n",
439 |     "Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is\n",
440 |     "Android is better than IOS\n",
441 |     "In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency\n",
442 |     "is a numerical statistic that is intended to reflect\n",
443 |     "how important a word is to a document in a collection or corpus.\n",
444 |     "It is often used as a weighting factor in searches of information retrieval\n",
445 |     "text mining, and user modeling. The tf-idf value increases proportionally\n",
446 |     "to the number of times a word appears in the document\n",
447 |     "and is offset by the frequency of the word in the corpus\n",
448 |     "\"\"\".split(\"\\n\")[1:-1]"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "markdown",
453 |    "metadata": {},
454 |    "source": [
455 |     "## Preprocessing and tokenizing\n",
456 |     "Firstly, we must bring every chars to lowercase and remove all punctuation, because it's not important for our task, but is very harmful for clustering algorithm. \n",
457 |     "After that, we'll split strings to array of words."
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": 15,
463 |    "metadata": {},
464 |    "outputs": [],
465 |    "source": [
466 |     "def preprocessing(line):\n",
467 |     "    line = line.lower()\n",
468 |     "    line = re.sub(r\"[{}]\".format(string.punctuation), \" \", line)\n",
469 |     "    return line"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "markdown",
474 |    "metadata": {},
475 |    "source": [
476 |     "Now, let's calculate tf-idf for this corpus"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 16,
482 |    "metadata": {},
483 |    "outputs": [],
484 |    "source": [
485 |     "tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)\n",
486 |     "tfidf = tfidf_vectorizer.fit_transform(all_text)"
487 |    ]
488 |   },
489 |   {
490 |    "cell_type": "markdown",
491 |    "metadata": {},
492 |    "source": [
493 |     "And train simple kmeans model with k = 2"
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "code",
498 |    "execution_count": 17,
499 |    "metadata": {},
500 |    "outputs": [],
501 |    "source": [
502 |     "kmeans = KMeans(n_clusters=2).fit(tfidf)"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "markdown",
507 |    "metadata": {},
508 |    "source": [
509 |     "Predictions"
510 |    ]
511 |   },
512 |   {
513 |    "cell_type": "code",
514 |    "execution_count": 18,
515 |    "metadata": {},
516 |    "outputs": [
517 |     {
518 |      "data": {
519 |       "text/plain": [
520 |        "array([1, 1], dtype=int32)"
521 |       ]
522 |      },
523 |      "execution_count": 18,
524 |      "metadata": {},
525 |      "output_type": "execute_result"
526 |     }
527 |    ],
528 |    "source": [
529 |     "lines_for_predicting = [\"tf and idf is awesome!\", \"some androids is there\"]\n",
530 |     "kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": []
539 |   }
540 |  ],
541 |  "metadata": {
542 |   "kernelspec": {
543 |    "display_name": "Python 3",
544 |    "language": "python",
545 |    "name": "python3"
546 |   },
547 |   "language_info": {
548 |    "codemirror_mode": {
549 |     "name": "ipython",
550 |     "version": 3
551 |    },
552 |    "file_extension": ".py",
553 |    "mimetype": "text/x-python",
554 |    "name": "python",
555 |    "nbconvert_exporter": "python",
556 |    "pygments_lexer": "ipython3",
557 |    "version": "3.6.5"
558 |   }
559 |  },
560 |  "nbformat": 4,
561 |  "nbformat_minor": 2
562 | }
563 | 


--------------------------------------------------------------------------------