├── .gitignore ├── LICENSE ├── README.md └── main.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Mihail Salnikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf-idf_and_k-means 2 | Text clustering with K-means and tf-idf 3 | 4 | Code for [Text clustering with K-means and tf-idf](https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183) blogpost. 5 | -------------------------------------------------------------------------------- /main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import string\n", 11 | "import pandas as pd\n", 12 | "from functools import reduce\n", 13 | "from math import log" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Simple example of [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)\n", 21 | "1. Example of corpus\n", 22 | "2. Preprocessing and Tokenizing\n", 23 | "3. Calculating bag of words\n", 24 | "4. TF\n", 25 | "5. IDF\n", 26 | "6. TF-IDF" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "#1\n", 36 | "corpus = \"\"\"\n", 37 | "Simple example with Cats and Mouse\n", 38 | "Another simple example with dogs and cats\n", 39 | "Another simple example with mouse and cheese\n", 40 | "\"\"\".split(\"\\n\")[1:-1]" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "['simple', 'example', 'with', 'cats', 'and', 'mouse']\n", 53 | "['another', 'simple', 'example', 'with', 'dogs', 'and', 'cats']\n", 54 | "['another', 'simple', 'example', 'with', 'mouse', 'and', 'cheese']\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "#2\n", 60 | "l_A = corpus[0].lower().split()\n", 61 | "l_B = corpus[1].lower().split()\n", 62 | "l_C = corpus[2].lower().split()\n", 63 | "\n", 64 | "print(l_A)\n", 65 | "print(l_B)\n", 66 | "print(l_C)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "{'dogs', 'mouse', 'simple', 'and', 'cheese', 'cats', 'another', 'example', 'with'}\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "#3\n", 84 | "word_set = set(l_A).union(set(l_B)).union(set(l_C))\n", 85 | "print(word_set)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/html": [ 96 | "
\n", 97 | "\n", 110 | "\n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | "
andanothercatscheesedogsexamplemousesimplewith
0101001111
1111011011
2110101111
\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " and another cats cheese dogs example mouse simple with\n", 168 | "0 1 0 1 0 0 1 1 1 1\n", 169 | "1 1 1 1 0 1 1 0 1 1\n", 170 | "2 1 1 0 1 0 1 1 1 1" 171 | ] 172 | }, 173 | "execution_count": 5, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "word_dict_A = dict.fromkeys(word_set, 0)\n", 180 | "word_dict_B = dict.fromkeys(word_set, 0)\n", 181 | "word_dict_C = dict.fromkeys(word_set, 0)\n", 182 | "\n", 183 | "for word in l_A:\n", 184 | " word_dict_A[word] += 1\n", 185 | "\n", 186 | "for word in l_B:\n", 187 | " word_dict_B[word] += 1\n", 188 | "\n", 189 | "for word in l_C:\n", 190 | " word_dict_C[word] += 1\n", 191 | "\n", 192 | "pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## \\#4 tf - term frequency\n", 200 | "In the case of the term frequency $tf(t,d)$, the simplest choice is to use the raw count of a term in a string. \n", 201 | "$${\\displaystyle \\mathrm {tf} (t,d)={\\frac {n_{t}}{\\sum _{k}n_{k}}}} $$\n", 202 | "where $n_t$ is the number of occurrences of the word $t$ in the string, and in the denominator - the total number of words in this string." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 6, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "def compute_tf(word_dict, l):\n", 212 | " tf = {}\n", 213 | " sum_nk = len(l)\n", 214 | " for word, count in word_dict.items():\n", 215 | " tf[word] = count/sum_nk\n", 216 | " return tf" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 7, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "tf_A = compute_tf(word_dict_A, l_A)\n", 226 | "tf_B = compute_tf(word_dict_B, l_B)\n", 227 | "tf_C = compute_tf(word_dict_C, l_C)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## \\#5 idf - inverse document frequency\n", 235 | "idf is a measure of how much information the word provides\n", 236 | "$$ \\mathrm{idf}(t, D) = \\log \\frac{N}{|\\{d \\in D: t \\in d\\}|} $$\n", 237 | "- $N$: total number of strings in the corpus ${\\displaystyle N={|D|}}$\n", 238 | "- ${\\displaystyle |\\{d\\in D:t\\in d\\}|}$ : number of strings where the term ${\\displaystyle t}$ appears (i.e., ${\\displaystyle \\mathrm {tf} (t,d)\\neq 0})$. If the term is not in the corpus, this will lead to a division-by-zero. It is therefore common to adjust the denominator to ${\\displaystyle 1+|\\{d\\in D:t\\in d\\}|}$." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 8, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "def compute_idf(strings_list):\n", 248 | " n = len(strings_list)\n", 249 | " idf = dict.fromkeys(strings_list[0].keys(), 0)\n", 250 | " for l in strings_list:\n", 251 | " for word, count in l.items():\n", 252 | " if count > 0:\n", 253 | " idf[word] += 1\n", 254 | " \n", 255 | " for word, v in idf.items():\n", 256 | " idf[word] = log(n / float(v))\n", 257 | " return idf" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 9, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "idf = compute_idf([word_dict_A, word_dict_B, word_dict_C])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "## \\# 6 tf-idf\n", 274 | "Then tf–idf is calculated as\n", 275 | "$$ {\\displaystyle \\mathrm {tfidf} (t,d,D)=\\mathrm {tf} (t,d)\\cdot \\mathrm {idf} (t,D)} $$" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 10, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "def compute_tf_idf(tf, idf):\n", 285 | " tf_idf = dict.fromkeys(tf.keys(), 0)\n", 286 | " for word, v in tf.items():\n", 287 | " tf_idf[word] = v * idf[word]\n", 288 | " return tf_idf" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 11, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "tf_idf_A = compute_tf_idf(tf_A, idf)\n", 298 | "tf_idf_B = compute_tf_idf(tf_B, idf)\n", 299 | "tf_idf_C = compute_tf_idf(tf_C, idf)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 12, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/html": [ 310 | "
\n", 311 | "\n", 324 | "\n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
andanothercatscheesedogsexamplemousesimplewith
00.00.0000000.0675780.0000000.0000000.00.0675780.00.0
10.00.0579240.0579240.0000000.1569450.00.0000000.00.0
20.00.0579240.0000000.1569450.0000000.00.0579240.00.0
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " and another cats cheese dogs example mouse simple \\\n", 382 | "0 0.0 0.000000 0.067578 0.000000 0.000000 0.0 0.067578 0.0 \n", 383 | "1 0.0 0.057924 0.057924 0.000000 0.156945 0.0 0.000000 0.0 \n", 384 | "2 0.0 0.057924 0.000000 0.156945 0.000000 0.0 0.057924 0.0 \n", 385 | "\n", 386 | " with \n", 387 | "0 0.0 \n", 388 | "1 0.0 \n", 389 | "2 0.0 " 390 | ] 391 | }, 392 | "execution_count": 12, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "pd.DataFrame([tf_idf_A, tf_idf_B, tf_idf_C])" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "# For clustering we must use tf-idf weights\n", 406 | "the example above is just an example, in practice it is better to apply [TfidfVectorizer from sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 416 | "from sklearn.cluster import KMeans" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "## Full text for clusterring\n", 424 | "\n", 425 | "This corpus contain some strings about Google and some strings about TF-IDF from Wikipedia. Just for example" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 14, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "all_text = \"\"\"\n", 435 | "Google and Facebook are strangling the free press to death. Democracy is the loser\n", 436 | "Your 60-second guide to security stuff Google touted today at Next '18\n", 437 | "A Guide to Using Android Without Selling Your Soul to Google\n", 438 | "Review: Lenovo’s Google Smart Display is pretty and intelligent\n", 439 | "Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is\n", 440 | "Android is better than IOS\n", 441 | "In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency\n", 442 | "is a numerical statistic that is intended to reflect\n", 443 | "how important a word is to a document in a collection or corpus.\n", 444 | "It is often used as a weighting factor in searches of information retrieval\n", 445 | "text mining, and user modeling. The tf-idf value increases proportionally\n", 446 | "to the number of times a word appears in the document\n", 447 | "and is offset by the frequency of the word in the corpus\n", 448 | "\"\"\".split(\"\\n\")[1:-1]" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## Preprocessing and tokenizing\n", 456 | "Firstly, we must bring every chars to lowercase and remove all punctuation, because it's not important for our task, but is very harmful for clustering algorithm. \n", 457 | "After that, we'll split strings to array of words." 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 15, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "def preprocessing(line):\n", 467 | " line = line.lower()\n", 468 | " line = re.sub(r\"[{}]\".format(string.punctuation), \" \", line)\n", 469 | " return line" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "Now, let's calculate tf-idf for this corpus" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 16, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)\n", 486 | "tfidf = tfidf_vectorizer.fit_transform(all_text)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "And train simple kmeans model with k = 2" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 17, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "kmeans = KMeans(n_clusters=2).fit(tfidf)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "Predictions" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 18, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "array([1, 1], dtype=int32)" 521 | ] 522 | }, 523 | "execution_count": 18, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "lines_for_predicting = [\"tf and idf is awesome!\", \"some androids is there\"]\n", 530 | "kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [] 539 | } 540 | ], 541 | "metadata": { 542 | "kernelspec": { 543 | "display_name": "Python 3", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": "text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.6.5" 558 | } 559 | }, 560 | "nbformat": 4, 561 | "nbformat_minor": 2 562 | } 563 | --------------------------------------------------------------------------------