├── .gitignore
├── LICENSE
├── README.md
└── main.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Mihail Salnikov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tf-idf_and_k-means
2 | Text clustering with K-means and tf-idf
3 |
4 | Code for [Text clustering with K-means and tf-idf](https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183) blogpost.
5 |
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import re\n",
10 | "import string\n",
11 | "import pandas as pd\n",
12 | "from functools import reduce\n",
13 | "from math import log"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Simple example of [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)\n",
21 | "1. Example of corpus\n",
22 | "2. Preprocessing and Tokenizing\n",
23 | "3. Calculating bag of words\n",
24 | "4. TF\n",
25 | "5. IDF\n",
26 | "6. TF-IDF"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "#1\n",
36 | "corpus = \"\"\"\n",
37 | "Simple example with Cats and Mouse\n",
38 | "Another simple example with dogs and cats\n",
39 | "Another simple example with mouse and cheese\n",
40 | "\"\"\".split(\"\\n\")[1:-1]"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "['simple', 'example', 'with', 'cats', 'and', 'mouse']\n",
53 | "['another', 'simple', 'example', 'with', 'dogs', 'and', 'cats']\n",
54 | "['another', 'simple', 'example', 'with', 'mouse', 'and', 'cheese']\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "#2\n",
60 | "l_A = corpus[0].lower().split()\n",
61 | "l_B = corpus[1].lower().split()\n",
62 | "l_C = corpus[2].lower().split()\n",
63 | "\n",
64 | "print(l_A)\n",
65 | "print(l_B)\n",
66 | "print(l_C)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 4,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "name": "stdout",
76 | "output_type": "stream",
77 | "text": [
78 | "{'dogs', 'mouse', 'simple', 'and', 'cheese', 'cats', 'another', 'example', 'with'}\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "#3\n",
84 | "word_set = set(l_A).union(set(l_B)).union(set(l_C))\n",
85 | "print(word_set)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/html": [
96 | "
\n",
97 | "\n",
110 | "
\n",
111 | " \n",
112 | " \n",
113 | " | \n",
114 | " and | \n",
115 | " another | \n",
116 | " cats | \n",
117 | " cheese | \n",
118 | " dogs | \n",
119 | " example | \n",
120 | " mouse | \n",
121 | " simple | \n",
122 | " with | \n",
123 | "
\n",
124 | " \n",
125 | " \n",
126 | " \n",
127 | " 0 | \n",
128 | " 1 | \n",
129 | " 0 | \n",
130 | " 1 | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 1 | \n",
134 | " 1 | \n",
135 | " 1 | \n",
136 | " 1 | \n",
137 | "
\n",
138 | " \n",
139 | " 1 | \n",
140 | " 1 | \n",
141 | " 1 | \n",
142 | " 1 | \n",
143 | " 0 | \n",
144 | " 1 | \n",
145 | " 1 | \n",
146 | " 0 | \n",
147 | " 1 | \n",
148 | " 1 | \n",
149 | "
\n",
150 | " \n",
151 | " 2 | \n",
152 | " 1 | \n",
153 | " 1 | \n",
154 | " 0 | \n",
155 | " 1 | \n",
156 | " 0 | \n",
157 | " 1 | \n",
158 | " 1 | \n",
159 | " 1 | \n",
160 | " 1 | \n",
161 | "
\n",
162 | " \n",
163 | "
\n",
164 | "
"
165 | ],
166 | "text/plain": [
167 | " and another cats cheese dogs example mouse simple with\n",
168 | "0 1 0 1 0 0 1 1 1 1\n",
169 | "1 1 1 1 0 1 1 0 1 1\n",
170 | "2 1 1 0 1 0 1 1 1 1"
171 | ]
172 | },
173 | "execution_count": 5,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "word_dict_A = dict.fromkeys(word_set, 0)\n",
180 | "word_dict_B = dict.fromkeys(word_set, 0)\n",
181 | "word_dict_C = dict.fromkeys(word_set, 0)\n",
182 | "\n",
183 | "for word in l_A:\n",
184 | " word_dict_A[word] += 1\n",
185 | "\n",
186 | "for word in l_B:\n",
187 | " word_dict_B[word] += 1\n",
188 | "\n",
189 | "for word in l_C:\n",
190 | " word_dict_C[word] += 1\n",
191 | "\n",
192 | "pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "## \\#4 tf - term frequency\n",
200 | "In the case of the term frequency $tf(t,d)$, the simplest choice is to use the raw count of a term in a string. \n",
201 | "$${\\displaystyle \\mathrm {tf} (t,d)={\\frac {n_{t}}{\\sum _{k}n_{k}}}} $$\n",
202 | "where $n_t$ is the number of occurrences of the word $t$ in the string, and in the denominator - the total number of words in this string."
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 6,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "def compute_tf(word_dict, l):\n",
212 | "    tf = {}  # word -> term frequency within this single document\n",
213 | "    sum_nk = len(l)  # total number of words in the document (denominator of tf)\n",
214 | "    for word, count in word_dict.items():\n",
215 | "        tf[word] = count/sum_nk  # tf(t, d) = n_t / sum_k(n_k)\n",
216 | "    return tf"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 7,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "tf_A = compute_tf(word_dict_A, l_A)\n",
226 | "tf_B = compute_tf(word_dict_B, l_B)\n",
227 | "tf_C = compute_tf(word_dict_C, l_C)"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "## \\#5 idf - inverse document frequency\n",
235 | "idf is a measure of how much information the word provides\n",
236 | "$$ \\mathrm{idf}(t, D) = \\log \\frac{N}{|\\{d \\in D: t \\in d\\}|} $$\n",
237 | "- $N$: total number of strings in the corpus ${\\displaystyle N={|D|}}$\n",
238 | "- ${\\displaystyle |\\{d\\in D:t\\in d\\}|}$ : number of strings where the term ${\\displaystyle t}$ appears (i.e., ${\\displaystyle \\mathrm {tf} (t,d)\\neq 0})$. If the term is not in the corpus, this will lead to a division-by-zero. It is therefore common to adjust the denominator to ${\\displaystyle 1+|\\{d\\in D:t\\in d\\}|}$."
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 8,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "def compute_idf(strings_list):\n",
248 | "    n = len(strings_list)  # N: total number of documents in the corpus\n",
249 | "    idf = dict.fromkeys(strings_list[0].keys(), 0)  # word -> document frequency\n",
250 | "    for l in strings_list:\n",
251 | "        for word, count in l.items():\n",
252 | "            if count > 0:\n",
253 | "                idf[word] += 1\n",
254 | "    \n",
255 | "    for word, v in idf.items():\n",
256 | "        idf[word] = log(n / float(v))  # idf(t, D) = log(N / df); df >= 1 since every key occurs in some document\n",
257 | "    return idf"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 9,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "idf = compute_idf([word_dict_A, word_dict_B, word_dict_C])"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "## \\# 6 tf-idf\n",
274 | "Then tf–idf is calculated as\n",
275 | "$$ {\\displaystyle \\mathrm {tfidf} (t,d,D)=\\mathrm {tf} (t,d)\\cdot \\mathrm {idf} (t,D)} $$"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 10,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "def compute_tf_idf(tf, idf):\n",
285 | "    tf_idf = dict.fromkeys(tf.keys(), 0)\n",
286 | "    for word, v in tf.items():\n",
287 | "        tf_idf[word] = v * idf[word]  # tfidf(t, d, D) = tf(t, d) * idf(t, D)\n",
288 | "    return tf_idf"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 11,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "tf_idf_A = compute_tf_idf(tf_A, idf)\n",
298 | "tf_idf_B = compute_tf_idf(tf_B, idf)\n",
299 | "tf_idf_C = compute_tf_idf(tf_C, idf)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 12,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/html": [
310 | "\n",
311 | "\n",
324 | "
\n",
325 | " \n",
326 | " \n",
327 | " | \n",
328 | " and | \n",
329 | " another | \n",
330 | " cats | \n",
331 | " cheese | \n",
332 | " dogs | \n",
333 | " example | \n",
334 | " mouse | \n",
335 | " simple | \n",
336 | " with | \n",
337 | "
\n",
338 | " \n",
339 | " \n",
340 | " \n",
341 | " 0 | \n",
342 | " 0.0 | \n",
343 | " 0.000000 | \n",
344 | " 0.067578 | \n",
345 | " 0.000000 | \n",
346 | " 0.000000 | \n",
347 | " 0.0 | \n",
348 | " 0.067578 | \n",
349 | " 0.0 | \n",
350 | " 0.0 | \n",
351 | "
\n",
352 | " \n",
353 | " 1 | \n",
354 | " 0.0 | \n",
355 | " 0.057924 | \n",
356 | " 0.057924 | \n",
357 | " 0.000000 | \n",
358 | " 0.156945 | \n",
359 | " 0.0 | \n",
360 | " 0.000000 | \n",
361 | " 0.0 | \n",
362 | " 0.0 | \n",
363 | "
\n",
364 | " \n",
365 | " 2 | \n",
366 | " 0.0 | \n",
367 | " 0.057924 | \n",
368 | " 0.000000 | \n",
369 | " 0.156945 | \n",
370 | " 0.000000 | \n",
371 | " 0.0 | \n",
372 | " 0.057924 | \n",
373 | " 0.0 | \n",
374 | " 0.0 | \n",
375 | "
\n",
376 | " \n",
377 | "
\n",
378 | "
"
379 | ],
380 | "text/plain": [
381 | " and another cats cheese dogs example mouse simple \\\n",
382 | "0 0.0 0.000000 0.067578 0.000000 0.000000 0.0 0.067578 0.0 \n",
383 | "1 0.0 0.057924 0.057924 0.000000 0.156945 0.0 0.000000 0.0 \n",
384 | "2 0.0 0.057924 0.000000 0.156945 0.000000 0.0 0.057924 0.0 \n",
385 | "\n",
386 | " with \n",
387 | "0 0.0 \n",
388 | "1 0.0 \n",
389 | "2 0.0 "
390 | ]
391 | },
392 | "execution_count": 12,
393 | "metadata": {},
394 | "output_type": "execute_result"
395 | }
396 | ],
397 | "source": [
398 | "pd.DataFrame([tf_idf_A, tf_idf_B, tf_idf_C])"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "# For clustering we must use tf-idf weights\n",
406 | "the example above is just an example, in practice it is better to apply [TfidfVectorizer from sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 13,
412 | "metadata": {},
413 | "outputs": [],
414 | "source": [
415 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
416 | "from sklearn.cluster import KMeans"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "## Full text for clustering\n",
424 | "\n",
425 | "This corpus contains some strings about Google and some strings about TF-IDF from Wikipedia, just as an example."
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 14,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "all_text = \"\"\"\n",
435 | "Google and Facebook are strangling the free press to death. Democracy is the loser\n",
436 | "Your 60-second guide to security stuff Google touted today at Next '18\n",
437 | "A Guide to Using Android Without Selling Your Soul to Google\n",
438 | "Review: Lenovo’s Google Smart Display is pretty and intelligent\n",
439 | "Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is\n",
440 | "Android is better than IOS\n",
441 | "In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency\n",
442 | "is a numerical statistic that is intended to reflect\n",
443 | "how important a word is to a document in a collection or corpus.\n",
444 | "It is often used as a weighting factor in searches of information retrieval\n",
445 | "text mining, and user modeling. The tf-idf value increases proportionally\n",
446 | "to the number of times a word appears in the document\n",
447 | "and is offset by the frequency of the word in the corpus\n",
448 | "\"\"\".split(\"\\n\")[1:-1]"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "## Preprocessing and tokenizing\n",
456 | "First, we must convert every character to lowercase and remove all punctuation, because casing and punctuation are not important for our task but are very harmful to the clustering algorithm. \n",
457 | "After that, we'll split strings to array of words."
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 15,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": [
466 | "def preprocessing(line):\n",
467 | "    line = line.lower()  # casing is irrelevant for clustering\n",
468 | "    line = re.sub(r\"[{}]\".format(re.escape(string.punctuation)), \" \", line)  # re.escape guards regex metachars (], \\, -) in string.punctuation\n",
469 | "    return line"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "Now, let's calculate tf-idf for this corpus"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 16,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)\n",
486 | "tfidf = tfidf_vectorizer.fit_transform(all_text)"
487 | ]
488 | },
489 | {
490 | "cell_type": "markdown",
491 | "metadata": {},
492 | "source": [
493 | "And train a simple k-means model with k = 2"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 17,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "kmeans = KMeans(n_clusters=2).fit(tfidf)"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "Predictions"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 18,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "data": {
519 | "text/plain": [
520 | "array([1, 1], dtype=int32)"
521 | ]
522 | },
523 | "execution_count": 18,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "lines_for_predicting = [\"tf and idf is awesome!\", \"some androids is there\"]\n",
530 | "kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": []
539 | }
540 | ],
541 | "metadata": {
542 | "kernelspec": {
543 | "display_name": "Python 3",
544 | "language": "python",
545 | "name": "python3"
546 | },
547 | "language_info": {
548 | "codemirror_mode": {
549 | "name": "ipython",
550 | "version": 3
551 | },
552 | "file_extension": ".py",
553 | "mimetype": "text/x-python",
554 | "name": "python",
555 | "nbconvert_exporter": "python",
556 | "pygments_lexer": "ipython3",
557 | "version": "3.6.5"
558 | }
559 | },
560 | "nbformat": 4,
561 | "nbformat_minor": 2
562 | }
563 |
--------------------------------------------------------------------------------