├── Kmeans ├── cluster_result_document.txt ├── cluster_result_keyword.txt ├── corpus_train.txt ├── k_select.png └── kmeans_cluster.py ├── Lda ├── cluster_keywords_lda.txt ├── cluster_keywords_lsi.txt ├── corpus_train.txt └── lda_cluster.py └── README.md /Kmeans/cluster_result_document.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 2,3 3 | 3,1 4 | 4,2 5 | 5,5 6 | 6,4 7 | 7,4 8 | 8,2 9 | 9,6 10 | 10,2 11 | 11,5 12 | 12,4 13 | 13,4 14 | 14,1 15 | 15,1 16 | 16,5 17 | 17,2 18 | 18,0 19 | 19,3 20 | 20,4 21 | 21,1 22 | 22,5 23 | 23,0 24 | 24,4 25 | 25,1 26 | 26,6 27 | 27,4 28 | 28,6 29 | 29,4 30 | 30,5 31 | 31,1 32 | 32,2 33 | 33,4 34 | 34,5 35 | 35,4 36 | 36,2 37 | 37,4 38 | 38,2 39 | 39,6 40 | 40,0 41 | 41,3 42 | 42,4 43 | 43,6 44 | 44,2 45 | 45,4 46 | 46,4 47 | 47,3 48 | 48,2 49 | 49,4 50 | 50,2 51 | 51,0 52 | 52,1 53 | 53,4 54 | 54,5 55 | 55,5 56 | 56,1 57 | 57,0 58 | 58,2 59 | 59,0 60 | 60,2 61 | 61,2 62 | 62,2 63 | 63,1 64 | 64,2 65 | 65,6 66 | 66,3 67 | 67,3 68 | 68,6 69 | 69,1 70 | 70,2 71 | 71,0 72 | 72,4 73 | 73,6 74 | 74,0 75 | 75,2 76 | 76,2 77 | 77,5 78 | 78,1 79 | 79,1 80 | 80,4 81 | 81,2 82 | 82,4 83 | 83,1 84 | 84,5 85 | 85,4 86 | 86,1 87 | 87,5 88 | 88,4 89 | 89,2 90 | 90,2 91 | 91,4 92 | 92,5 93 | 93,4 94 | 94,5 95 | 95,4 96 | 96,1 97 | 97,1 98 | 98,6 99 | 99,4 100 | 100,1 101 | 101,0 102 | 102,3 103 | 103,3 104 | 104,1 105 | 105,3 106 | 106,3 107 | 107,1 108 | 108,0 109 | 109,4 110 | 110,3 111 | 111,4 112 | 112,4 113 | 113,6 114 | 114,5 115 | 115,4 116 | 116,5 117 | 117,1 118 | 118,3 119 | 119,3 120 | 120,2 121 | 121,4 122 | 122,4 123 | 123,2 124 | 124,0 125 | 125,5 126 | 126,1 127 | 127,5 128 | 128,2 129 | 129,3 130 | 130,6 131 | 131,5 132 | 132,6 133 | 133,2 134 | 134,0 135 | 135,4 136 | 136,2 137 | 137,4 138 | 138,4 139 | 139,3 140 | 140,6 141 | 141,4 142 | 142,2 143 | 143,3 144 | 144,4 145 | 145,2 146 | 146,3 147 | 147,0 148 | 148,6 149 | 149,2 150 | 150,2 151 | 151,3 152 | 152,0 153 | 153,2 154 | 
154,4 155 | 155,4 156 | 156,4 157 | 157,5 158 | 158,3 159 | 159,2 160 | 160,2 161 | 161,5 162 | 162,6 163 | 163,4 164 | 164,5 165 | 165,0 166 | 166,5 167 | 167,3 168 | 168,4 169 | 169,3 170 | 170,2 171 | 171,5 172 | 172,2 173 | 173,1 174 | 174,3 175 | 175,6 176 | 176,4 177 | 177,4 178 | 178,6 179 | 179,2 180 | 180,2 181 | 181,3 182 | 182,6 183 | 183,3 184 | 184,1 185 | 185,2 186 | 186,0 187 | 187,3 188 | 188,0 189 | 189,1 190 | 190,3 191 | 191,6 192 | 192,3 193 | 193,6 194 | 194,5 195 | 195,0 196 | 196,2 197 | 197,4 198 | 198,1 199 | 199,3 200 | 200,3 201 | 201,1 202 | 202,1 203 | 203,6 204 | 204,5 205 | 205,1 206 | 206,4 207 | 207,5 208 | 208,4 209 | 209,4 210 | 210,3 211 | 211,3 212 | 212,5 213 | 213,5 214 | 214,6 215 | 215,2 216 | 216,6 217 | 217,6 218 | 218,5 219 | 219,0 220 | 220,0 221 | 221,0 222 | 222,5 223 | 223,3 224 | 224,3 225 | 225,2 226 | 226,2 227 | 227,4 228 | 228,4 229 | 229,3 230 | 230,4 231 | 231,4 232 | 232,4 233 | 233,2 234 | 234,6 235 | 235,2 236 | 236,3 237 | 237,2 238 | 238,1 239 | 239,0 240 | 240,0 241 | 241,2 242 | 242,3 243 | 243,2 244 | 244,6 245 | 245,3 246 | 246,3 247 | 247,0 248 | 248,4 249 | 249,4 250 | 250,4 251 | 251,5 252 | 252,2 253 | 253,5 254 | 254,5 255 | 255,6 256 | 256,4 257 | 257,6 258 | 258,3 259 | 259,4 260 | 260,3 261 | 261,4 262 | 262,2 263 | 263,1 264 | 264,4 265 | 265,5 266 | 266,2 267 | 267,5 268 | 268,6 269 | 269,6 270 | 270,4 271 | 271,4 272 | 272,2 273 | 273,2 274 | 274,6 275 | 275,1 276 | 276,5 277 | 277,6 278 | 278,2 279 | 279,4 280 | 280,1 281 | 281,6 282 | 282,0 283 | 283,6 284 | 284,5 285 | 285,4 286 | 286,2 287 | 287,4 288 | 288,0 289 | 289,6 290 | 290,1 291 | 291,2 292 | 292,2 293 | 293,3 294 | 294,6 295 | 295,4 296 | 296,4 297 | 297,4 298 | 298,0 299 | 299,6 300 | 300,6 301 | 301,5 302 | 302,2 303 | 303,5 304 | 304,4 305 | 305,5 306 | 306,4 307 | 307,3 308 | 308,1 309 | 309,4 310 | 310,6 311 | 311,3 312 | 312,2 313 | 313,4 314 | 314,1 315 | 315,3 316 | 316,1 317 | 317,4 318 | 318,2 319 | 319,2 320 | 320,2 
321 | 321,2 322 | 322,1 323 | 323,0 324 | 324,2 325 | 325,4 326 | 326,2 327 | 327,6 328 | 328,1 329 | 329,4 330 | 330,2 331 | 331,2 332 | 332,1 333 | 333,0 334 | 334,5 335 | 335,2 336 | 336,0 337 | 337,2 338 | 338,5 339 | 339,6 340 | 340,5 341 | 341,4 342 | 342,3 343 | 343,6 344 | 344,1 345 | 345,6 346 | 346,6 347 | 347,1 348 | 348,0 349 | 349,3 350 | 350,5 351 | 351,2 352 | 352,4 353 | 353,4 354 | 354,0 355 | 355,0 356 | 356,4 357 | 357,4 358 | 358,2 359 | 359,5 360 | 360,4 361 | 361,2 362 | 362,0 363 | 363,1 364 | 364,2 365 | 365,2 366 | 366,3 367 | 367,1 368 | 368,4 369 | 369,5 370 | 370,3 371 | 371,6 372 | 372,3 373 | 373,6 374 | 374,1 375 | 375,5 376 | 376,0 377 | 377,2 378 | 378,6 379 | 379,1 380 | 380,2 381 | 381,6 382 | 382,0 383 | 383,6 384 | 384,5 385 | 385,3 386 | 386,2 387 | 387,4 388 | 388,6 389 | 389,6 390 | 390,3 391 | 391,4 392 | 392,6 393 | 393,3 394 | 394,4 395 | 395,4 396 | 396,4 397 | 397,1 398 | 398,2 399 | 399,4 400 | 400,1 401 | 401,3 402 | 402,5 403 | 403,4 404 | 404,6 405 | 405,4 406 | 406,1 407 | 407,3 408 | 408,0 409 | 409,4 410 | 410,6 411 | 411,0 412 | 412,6 413 | 413,2 414 | 414,2 415 | 415,4 416 | 416,0 417 | 417,6 418 | 418,3 419 | 419,3 420 | 420,2 421 | 421,1 422 | 422,5 423 | 423,0 424 | 424,1 425 | 425,2 426 | 426,4 427 | 427,5 428 | 428,2 429 | 429,0 430 | 430,6 431 | 431,4 432 | 432,2 433 | 433,4 434 | 434,1 435 | 435,0 436 | 436,0 437 | 437,2 438 | 438,2 439 | 439,6 440 | 440,5 441 | 441,3 442 | 442,6 443 | 443,2 444 | 444,2 445 | 445,2 446 | 446,3 447 | 447,2 448 | 448,2 449 | 449,2 450 | 450,5 451 | 451,1 452 | 452,1 453 | 453,4 454 | 454,3 455 | 455,3 456 | 456,4 457 | 457,2 458 | 458,4 459 | 459,3 460 | 460,1 461 | 461,0 462 | 462,3 463 | 463,6 464 | 464,3 465 | 465,1 466 | 466,2 467 | 467,4 468 | 468,6 469 | 469,1 470 | 470,2 471 | 471,6 472 | 472,2 473 | 473,2 474 | 474,2 475 | 475,1 476 | 476,5 477 | 477,1 478 | 478,1 479 | 479,2 480 | 480,0 481 | 481,1 482 | 482,1 483 | 483,2 484 | 484,5 485 | 485,4 486 | 486,6 487 | 
487,2 488 | 488,4 489 | 489,4 490 | 490,2 491 | 491,1 492 | 492,3 493 | 493,5 494 | 494,0 495 | 495,1 496 | 496,0 497 | 497,2 498 | 498,2 499 | 499,3 500 | 500,0 501 | 501,4 502 | 502,1 503 | 503,0 504 | 504,4 505 | 505,5 506 | 506,4 507 | 507,5 508 | 508,2 509 | 509,1 510 | 510,4 511 | 511,2 512 | 512,4 513 | 513,2 514 | 514,2 515 | 515,2 516 | 516,3 517 | 517,5 518 | 518,4 519 | 519,2 520 | 520,2 521 | 521,2 522 | 522,3 523 | 523,3 524 | 524,1 525 | 525,2 526 | 526,4 527 | 527,4 528 | 528,1 529 | 529,6 530 | 530,0 531 | 531,2 532 | 532,2 533 | 533,1 534 | 534,4 535 | 535,2 536 | 536,5 537 | 537,3 538 | 538,0 539 | 539,4 540 | 540,4 541 | 541,3 542 | 542,4 543 | 543,4 544 | 544,6 545 | 545,4 546 | 546,4 547 | 547,1 548 | 548,6 549 | 549,3 550 | 550,2 551 | 551,5 552 | 552,3 553 | 553,4 554 | 554,4 555 | 555,2 556 | 556,1 557 | 557,0 558 | 558,6 559 | 559,1 560 | 560,1 561 | 561,5 562 | 562,5 563 | 563,0 564 | 564,2 565 | 565,5 566 | 566,0 567 | 567,0 568 | 568,1 569 | 569,5 570 | 570,2 571 | 571,1 572 | 572,5 573 | 573,4 574 | 574,2 575 | 575,4 576 | 576,3 577 | 577,2 578 | 578,2 579 | 579,2 580 | 580,5 581 | 581,2 582 | 582,6 583 | 583,2 584 | 584,4 585 | 585,3 586 | 586,2 587 | 587,2 588 | 588,3 589 | 589,2 590 | 590,2 591 | 591,1 592 | 592,5 593 | 593,3 594 | 594,0 595 | 595,2 596 | 596,2 597 | 597,3 598 | 598,3 599 | 599,2 600 | 600,0 601 | 601,2 602 | 602,0 603 | 603,2 604 | 604,4 605 | 605,2 606 | 606,4 607 | 607,1 608 | 608,4 609 | 609,6 610 | 610,3 611 | 611,3 612 | 612,1 613 | 613,4 614 | 614,2 615 | 615,6 616 | 616,0 617 | 617,2 618 | 618,4 619 | 619,1 620 | 620,2 621 | 621,6 622 | 622,2 623 | 623,2 624 | 624,4 625 | 625,0 626 | 626,4 627 | 627,4 628 | 628,2 629 | 629,4 630 | 630,2 631 | 631,0 632 | 632,6 633 | 633,4 634 | 634,5 635 | 635,0 636 | 636,4 637 | 637,0 638 | 638,2 639 | 639,6 640 | 640,5 641 | 641,4 642 | 642,2 643 | 643,4 644 | 644,1 645 | 645,3 646 | 646,4 647 | 647,3 648 | 648,4 649 | 649,4 650 | 650,4 651 | 651,1 652 | 652,4 653 | 653,0 
654 | 654,6 655 | 655,4 656 | 656,4 657 | 657,4 658 | 658,2 659 | 659,1 660 | 660,5 661 | 661,2 662 | 662,4 663 | 663,6 664 | 664,5 665 | 665,2 666 | 666,2 667 | 667,3 668 | 668,4 669 | 669,6 670 | 670,2 671 | 671,5 672 | 672,2 673 | 673,3 674 | 674,0 675 | 675,1 676 | 676,6 677 | 677,4 678 | 678,0 679 | 679,2 680 | 680,4 681 | 681,2 682 | 682,2 683 | 683,3 684 | 684,4 685 | 685,4 686 | 686,4 687 | 687,4 688 | 688,6 689 | 689,4 690 | 690,0 691 | 691,2 692 | 692,3 693 | 693,4 694 | 694,2 695 | 695,0 696 | 696,2 697 | 697,1 698 | 698,4 699 | 699,1 700 | 700,2 701 | 701,4 702 | 702,2 703 | 703,6 704 | 704,5 705 | 705,2 706 | 706,5 707 | 707,1 708 | 708,2 709 | 709,6 710 | 710,3 711 | 711,2 712 | 712,5 713 | 713,0 714 | 714,5 715 | 715,4 716 | 716,2 717 | 717,4 718 | 718,6 719 | 719,2 720 | 720,1 721 | 721,2 722 | 722,3 723 | 723,0 724 | 724,0 725 | 725,5 726 | 726,5 727 | 727,6 728 | 728,4 729 | 729,6 730 | 730,2 731 | 731,2 732 | 732,5 733 | 733,3 734 | 734,5 735 | 735,1 736 | 736,1 737 | 737,2 738 | 738,5 739 | 739,3 740 | 740,1 741 | 741,4 742 | 742,5 743 | 743,2 744 | 744,1 745 | 745,1 746 | 746,5 747 | 747,4 748 | 748,6 749 | 749,6 750 | 750,2 751 | 751,5 752 | 752,3 753 | 753,4 754 | 754,3 755 | 755,1 756 | 756,2 757 | 757,1 758 | 758,5 759 | 759,5 760 | 760,1 761 | 761,6 762 | 762,2 763 | 763,1 764 | 764,5 765 | 765,4 766 | 766,1 767 | 767,3 768 | 768,4 769 | 769,4 770 | 770,2 771 | 771,2 772 | 772,1 773 | 773,5 774 | 774,2 775 | 775,0 776 | 776,2 777 | 777,4 778 | 778,0 779 | 779,3 780 | 780,4 781 | 781,2 782 | 782,6 783 | 783,6 784 | 784,1 785 | 785,6 786 | 786,2 787 | 787,4 788 | 788,2 789 | 789,2 790 | 790,2 791 | 791,2 792 | 792,5 793 | 793,6 794 | 794,3 795 | 795,1 796 | 796,3 797 | 797,1 798 | 798,0 799 | 799,2 800 | 800,4 801 | 801,2 802 | 802,5 803 | 803,2 804 | 804,2 805 | 805,4 806 | 806,0 807 | 807,2 808 | 808,1 809 | 809,5 810 | 810,1 811 | 811,3 812 | 812,5 813 | 813,4 814 | 814,3 815 | 815,4 816 | 816,4 817 | 817,6 818 | 818,0 819 | 819,2 820 | 
820,2 821 | 821,0 822 | 822,2 823 | 823,4 824 | 824,1 825 | 825,2 826 | 826,4 827 | 827,6 828 | 828,1 829 | 829,1 830 | 830,1 831 | 831,2 832 | 832,0 833 | 833,5 834 | 834,2 835 | 835,3 836 | 836,1 837 | 837,2 838 | 838,3 839 | 839,2 840 | 840,3 841 | 841,1 842 | 842,6 843 | 843,2 844 | 844,1 845 | 845,4 846 | 846,6 847 | 847,3 848 | 848,4 849 | 849,3 850 | 850,2 851 | 851,1 852 | 852,6 853 | 853,5 854 | 854,3 855 | 855,5 856 | 856,3 857 | 857,4 858 | 858,4 859 | 859,2 860 | 860,2 861 | 861,4 862 | 862,2 863 | 863,4 864 | 864,5 865 | 865,4 866 | 866,1 867 | 867,4 868 | 868,6 869 | 869,1 870 | 870,2 871 | 871,2 872 | 872,1 873 | 873,1 874 | 874,6 875 | 875,2 876 | 876,2 877 | 877,6 878 | 878,1 879 | 879,4 880 | 880,5 881 | 881,1 882 | 882,6 883 | 883,0 884 | 884,4 885 | 885,0 886 | 886,3 887 | 887,6 888 | 888,6 889 | 889,2 890 | 890,4 891 | 891,2 892 | 892,4 893 | 893,2 894 | 894,5 895 | 895,2 896 | 896,5 897 | 897,5 898 | 898,5 899 | 899,5 900 | 900,2 901 | 901,6 902 | 902,2 903 | 903,1 904 | 904,5 905 | 905,0 906 | 906,1 907 | 907,1 908 | 908,5 909 | 909,5 910 | 910,5 911 | 911,6 912 | 912,6 913 | 913,0 914 | 914,4 915 | 915,6 916 | 916,5 917 | 917,2 918 | 918,3 919 | 919,3 920 | 920,2 921 | 921,2 922 | 922,6 923 | 923,1 924 | 924,5 925 | 925,4 926 | 926,5 927 | 927,5 928 | 928,0 929 | 929,4 930 | 930,2 931 | 931,6 932 | 932,1 933 | 933,3 934 | 934,6 935 | 935,1 936 | 936,4 937 | 937,3 938 | 938,2 939 | 939,0 940 | 940,5 941 | 941,5 942 | 942,3 943 | 943,6 944 | 944,0 945 | 945,0 946 | 946,4 947 | 947,0 948 | 948,4 949 | 949,2 950 | 950,2 951 | 951,3 952 | 952,6 953 | 953,5 954 | 954,1 955 | 955,2 956 | 956,2 957 | 957,0 958 | 958,1 959 | 959,1 960 | 960,3 961 | 961,2 962 | 962,2 963 | 963,1 964 | 964,3 965 | 965,4 966 | 966,2 967 | 967,0 968 | 968,4 969 | 969,5 970 | 970,5 971 | 971,5 972 | 972,3 973 | 973,4 974 | 974,6 975 | 975,2 976 | 976,6 977 | 977,2 978 | 978,5 979 | 979,6 980 | 980,2 981 | 981,4 982 | 982,2 983 | 983,2 984 | 984,3 985 | 985,4 986 | 986,4 
987 | 987,1 988 | 988,3 989 | 989,4 990 | 990,5 991 | 991,2 992 | 992,1 993 | 993,1 994 | 994,5 995 | 995,4 996 | 996,4 997 | 997,6 998 | 998,2 999 | 999,6 1000 | 1000,4 1001 | 1001,3 1002 | 1002,4 1003 | 1003,5 1004 | 1004,5 1005 | 1005,2 1006 | 1006,3 1007 | 1007,2 1008 | 1008,0 1009 | 1009,2 1010 | 1010,1 1011 | 1011,2 1012 | 1012,6 1013 | 1013,5 1014 | 1014,2 1015 | 1015,4 1016 | 1016,6 1017 | 1017,2 1018 | 1018,6 1019 | 1019,4 1020 | 1020,2 1021 | 1021,1 1022 | 1022,2 1023 | 1023,6 1024 | 1024,3 1025 | 1025,3 1026 | 1026,2 1027 | 1027,6 1028 | 1028,3 1029 | 1029,2 1030 | 1030,6 1031 | 1031,2 1032 | 1032,1 1033 | 1033,2 1034 | 1034,1 1035 | 1035,4 1036 | 1036,2 1037 | 1037,2 1038 | 1038,3 1039 | 1039,2 1040 | 1040,1 1041 | 1041,4 1042 | 1042,5 1043 | 1043,4 1044 | 1044,2 1045 | 1045,6 1046 | 1046,0 1047 | 1047,6 1048 | 1048,1 1049 | 1049,2 1050 | 1050,4 1051 | 1051,2 1052 | 1052,0 1053 | 1053,4 1054 | 1054,5 1055 | 1055,2 1056 | 1056,2 1057 | 1057,2 1058 | 1058,3 1059 | 1059,5 1060 | 1060,4 1061 | 1061,5 1062 | 1062,4 1063 | 1063,1 1064 | 1064,1 1065 | 1065,2 1066 | 1066,1 1067 | 1067,3 1068 | 1068,0 1069 | 1069,2 1070 | 1070,5 1071 | 1071,4 1072 | 1072,4 1073 | 1073,4 1074 | 1074,3 1075 | 1075,0 1076 | 1076,6 1077 | 1077,4 1078 | 1078,5 1079 | 1079,0 1080 | 1080,4 1081 | 1081,5 1082 | 1082,4 1083 | 1083,1 1084 | 1084,4 1085 | 1085,4 1086 | 1086,3 1087 | 1087,2 1088 | 1088,1 1089 | 1089,1 1090 | 1090,1 1091 | 1091,0 1092 | 1092,2 1093 | 1093,3 1094 | 1094,4 1095 | 1095,0 1096 | 1096,3 1097 | 1097,2 1098 | 1098,1 1099 | 1099,6 1100 | 1100,4 1101 | 1101,6 1102 | 1102,1 1103 | 1103,6 1104 | 1104,2 1105 | 1105,1 1106 | 1106,5 1107 | 1107,5 1108 | 1108,0 1109 | 1109,2 1110 | 1110,6 1111 | 1111,5 1112 | 1112,4 1113 | 1113,2 1114 | 1114,6 1115 | 1115,6 1116 | 1116,3 1117 | 1117,6 1118 | 1118,4 1119 | 1119,2 1120 | 1120,4 1121 | 1121,4 1122 | 1122,4 1123 | 1123,1 1124 | 1124,1 1125 | 1125,4 1126 | 1126,2 1127 | 1127,3 1128 | 1128,0 1129 | 1129,1 1130 | 1130,4 1131 | 
1131,3 1132 | 1132,2 1133 | 1133,4 1134 | 1134,5 1135 | 1135,5 1136 | 1136,6 1137 | 1137,5 1138 | 1138,4 1139 | 1139,2 1140 | 1140,4 1141 | 1141,2 1142 | 1142,1 1143 | 1143,5 1144 | 1144,0 1145 | 1145,6 1146 | 1146,0 1147 | 1147,1 1148 | 1148,1 1149 | 1149,2 1150 | 1150,1 1151 | 1151,6 1152 | 1152,3 1153 | 1153,4 1154 | 1154,6 1155 | 1155,2 1156 | 1156,3 1157 | 1157,2 1158 | 1158,5 1159 | 1159,1 1160 | 1160,6 1161 | 1161,5 1162 | 1162,4 1163 | 1163,0 1164 | 1164,2 1165 | 1165,4 1166 | 1166,6 1167 | 1167,2 1168 | 1168,2 1169 | 1169,2 1170 | 1170,0 1171 | 1171,6 1172 | 1172,2 1173 | 1173,4 1174 | 1174,1 1175 | 1175,6 1176 | 1176,4 1177 | 1177,1 1178 | 1178,4 1179 | 1179,1 1180 | 1180,2 1181 | 1181,1 1182 | 1182,3 1183 | 1183,2 1184 | 1184,3 1185 | 1185,0 1186 | 1186,3 1187 | 1187,6 1188 | 1188,0 1189 | 1189,3 1190 | 1190,6 1191 | 1191,3 1192 | 1192,2 1193 | 1193,3 1194 | 1194,6 1195 | 1195,6 1196 | 1196,1 1197 | 1197,1 1198 | 1198,2 1199 | 1199,4 1200 | 1200,5 1201 | 1201,6 1202 | 1202,1 1203 | 1203,5 1204 | 1204,1 1205 | 1205,2 1206 | 1206,2 1207 | 1207,2 1208 | 1208,4 1209 | 1209,5 1210 | 1210,4 1211 | 1211,6 1212 | 1212,1 1213 | 1213,2 1214 | 1214,1 1215 | 1215,4 1216 | 1216,5 1217 | 1217,5 1218 | 1218,6 1219 | 1219,4 1220 | 1220,4 1221 | 1221,2 1222 | 1222,3 1223 | 1223,2 1224 | 1224,3 1225 | 1225,4 1226 | 1226,2 1227 | 1227,2 1228 | 1228,6 1229 | 1229,2 1230 | 1230,5 1231 | 1231,4 1232 | 1232,5 1233 | 1233,0 1234 | 1234,5 1235 | 1235,2 1236 | 1236,1 1237 | 1237,4 1238 | 1238,4 1239 | 1239,2 1240 | 1240,4 1241 | 1241,2 1242 | 1242,6 1243 | 1243,2 1244 | 1244,2 1245 | 1245,1 1246 | 1246,0 1247 | 1247,2 1248 | 1248,2 1249 | 1249,4 1250 | 1250,5 1251 | 1251,5 1252 | 1252,2 1253 | 1253,5 1254 | 1254,1 1255 | 1255,4 1256 | 1256,3 1257 | 1257,3 1258 | 1258,3 1259 | 1259,3 1260 | 1260,6 1261 | 1261,6 1262 | 1262,1 1263 | 1263,2 1264 | 1264,2 1265 | 1265,4 1266 | 1266,4 1267 | 1267,5 1268 | 1268,6 1269 | 1269,4 1270 | 1270,1 1271 | 1271,2 1272 | 1272,5 1273 | 1273,2 1274 
| 1274,4 1275 | 1275,1 1276 | 1276,6 1277 | 1277,2 1278 | 1278,6 1279 | 1279,0 1280 | 1280,4 1281 | 1281,6 1282 | 1282,6 1283 | 1283,2 1284 | 1284,6 1285 | 1285,6 1286 | 1286,4 1287 | 1287,2 1288 | 1288,0 1289 | 1289,2 1290 | 1290,5 1291 | 1291,4 1292 | 1292,2 1293 | 1293,2 1294 | 1294,2 1295 | 1295,0 1296 | 1296,4 1297 | 1297,5 1298 | 1298,4 1299 | 1299,5 1300 | 1300,1 1301 | 1301,5 1302 | 1302,2 1303 | 1303,2 1304 | 1304,6 1305 | 1305,3 1306 | 1306,2 1307 | 1307,4 1308 | 1308,1 1309 | 1309,1 1310 | 1310,2 1311 | 1311,4 1312 | 1312,1 1313 | 1313,3 1314 | 1314,2 1315 | 1315,3 1316 | 1316,2 1317 | 1317,2 1318 | 1318,3 1319 | 1319,2 1320 | 1320,1 1321 | 1321,5 1322 | 1322,1 1323 | 1323,2 1324 | 1324,4 1325 | 1325,1 1326 | 1326,2 1327 | 1327,0 1328 | 1328,2 1329 | 1329,2 1330 | 1330,6 1331 | 1331,3 1332 | 1332,5 1333 | 1333,3 1334 | 1334,4 1335 | 1335,4 1336 | 1336,5 1337 | 1337,5 1338 | 1338,2 1339 | 1339,5 1340 | 1340,0 1341 | 1341,1 1342 | 1342,4 1343 | 1343,3 1344 | 1344,3 1345 | 1345,2 1346 | 1346,3 1347 | 1347,4 1348 | 1348,2 1349 | 1349,1 1350 | 1350,3 1351 | 1351,6 1352 | 1352,5 1353 | 1353,2 1354 | 1354,6 1355 | 1355,5 1356 | 1356,4 1357 | 1357,3 1358 | 1358,2 1359 | 1359,2 1360 | 1360,3 1361 | 1361,3 1362 | 1362,3 1363 | 1363,2 1364 | 1364,0 1365 | 1365,0 1366 | 1366,1 1367 | 1367,2 1368 | 1368,4 1369 | 1369,2 1370 | 1370,0 1371 | 1371,4 1372 | 1372,6 1373 | 1373,4 1374 | 1374,0 1375 | 1375,6 1376 | 1376,4 1377 | 1377,2 1378 | 1378,5 1379 | 1379,0 1380 | 1380,2 1381 | 1381,1 1382 | 1382,2 1383 | 1383,2 1384 | 1384,3 1385 | 1385,1 1386 | 1386,1 1387 | 1387,2 1388 | 1388,2 1389 | 1389,3 1390 | 1390,4 1391 | 1391,1 1392 | 1392,4 1393 | 1393,2 1394 | 1394,4 1395 | 1395,2 1396 | 1396,3 1397 | 1397,4 1398 | 1398,4 1399 | 1399,6 1400 | 1400,1 1401 | 1401,6 1402 | 1402,1 1403 | 1403,2 1404 | 1404,4 1405 | 1405,0 1406 | 1406,1 1407 | 1407,6 1408 | 1408,4 1409 | 1409,1 1410 | 1410,1 1411 | 1411,5 1412 | 1412,5 1413 | 1413,5 1414 | 1414,6 1415 | 1415,2 1416 | 1416,0 
1417 | 1417,1 1418 | 1418,3 1419 | 1419,4 1420 | 1420,2 1421 | 1421,2 1422 | 1422,6 1423 | 1423,0 1424 | 1424,4 1425 | 1425,6 1426 | 1426,0 1427 | 1427,6 1428 | 1428,6 1429 | 1429,4 1430 | 1430,2 1431 | 1431,2 1432 | 1432,2 1433 | 1433,4 1434 | 1434,3 1435 | 1435,2 1436 | 1436,3 1437 | 1437,2 1438 | 1438,1 1439 | 1439,6 1440 | 1440,5 1441 | 1441,2 1442 | 1442,4 1443 | 1443,3 1444 | 1444,4 1445 | 1445,5 1446 | 1446,1 1447 | 1447,4 1448 | 1448,4 1449 | 1449,4 1450 | 1450,4 1451 | 1451,6 1452 | 1452,0 1453 | 1453,1 1454 | 1454,4 1455 | 1455,4 1456 | 1456,4 1457 | 1457,2 1458 | 1458,5 1459 | 1459,2 1460 | 1460,4 1461 | 1461,1 1462 | 1462,2 1463 | 1463,6 1464 | 1464,5 1465 | 1465,6 1466 | 1466,4 1467 | 1467,6 1468 | 1468,3 1469 | 1469,2 1470 | 1470,6 1471 | 1471,1 1472 | 1472,1 1473 | 1473,4 1474 | 1474,4 1475 | 1475,6 1476 | 1476,5 1477 | 1477,1 1478 | 1478,2 1479 | 1479,2 1480 | 1480,6 1481 | 1481,6 1482 | 1482,4 1483 | 1483,2 1484 | 1484,6 1485 | 1485,6 1486 | 1486,4 1487 | 1487,4 1488 | 1488,4 1489 | 1489,4 1490 | 1490,5 1491 | 1491,5 1492 | 1492,5 1493 | 1493,2 1494 | 1494,3 1495 | 1495,2 1496 | 1496,0 1497 | 1497,0 1498 | 1498,2 1499 | 1499,1 1500 | 1500,6 1501 | 1501,5 1502 | 1502,4 1503 | 1503,4 1504 | 1504,5 1505 | 1505,4 1506 | 1506,2 1507 | 1507,2 1508 | 1508,4 1509 | 1509,4 1510 | 1510,2 1511 | 1511,1 1512 | 1512,1 1513 | 1513,3 1514 | 1514,0 1515 | 1515,2 1516 | 1516,3 1517 | 1517,1 1518 | 1518,4 1519 | 1519,2 1520 | 1520,1 1521 | 1521,3 1522 | 1522,4 1523 | 1523,1 1524 | 1524,2 1525 | 1525,5 1526 | 1526,2 1527 | 1527,6 1528 | 1528,2 1529 | 1529,5 1530 | 1530,2 1531 | 1531,2 1532 | 1532,1 1533 | 1533,6 1534 | 1534,1 1535 | 1535,4 1536 | 1536,1 1537 | 1537,2 1538 | 1538,0 1539 | 1539,4 1540 | 1540,1 1541 | 1541,2 1542 | 1542,2 1543 | 1543,6 1544 | 1544,1 1545 | 1545,2 1546 | 1546,2 1547 | 1547,4 1548 | 1548,4 1549 | 1549,1 1550 | 1550,4 1551 | 1551,1 1552 | 1552,6 1553 | 1553,2 1554 | 1554,4 1555 | 1555,2 1556 | 1556,4 1557 | 1557,2 1558 | 1558,4 1559 | 
1559,4 1560 | 1560,4 1561 | 1561,1 1562 | 1562,3 1563 | 1563,2 1564 | 1564,4 1565 | 1565,6 1566 | 1566,1 1567 | 1567,1 1568 | 1568,5 1569 | 1569,2 1570 | 1570,5 1571 | 1571,4 1572 | 1572,4 1573 | 1573,3 1574 | 1574,1 1575 | 1575,2 1576 | 1576,2 1577 | 1577,2 1578 | 1578,1 1579 | 1579,4 1580 | 1580,0 1581 | 1581,0 1582 | 1582,2 1583 | 1583,3 1584 | 1584,0 1585 | 1585,3 1586 | 1586,3 1587 | 1587,3 1588 | 1588,2 1589 | 1589,6 1590 | 1590,4 1591 | 1591,6 1592 | 1592,2 1593 | 1593,6 1594 | 1594,0 1595 | 1595,2 1596 | 1596,4 1597 | 1597,2 1598 | 1598,0 1599 | 1599,0 1600 | 1600,3 1601 | 1601,3 1602 | 1602,5 1603 | 1603,1 1604 | 1604,6 1605 | 1605,5 1606 | 1606,0 1607 | 1607,4 1608 | 1608,2 1609 | 1609,3 1610 | 1610,4 1611 | -------------------------------------------------------------------------------- /Kmeans/cluster_result_keyword.txt: -------------------------------------------------------------------------------- 1 | 1 game,england,match,france,player,football,team,french,paris,wembley,stadium,fan,night,friday,friendly,germany,play,fa,world,show,cup,time,security,tuesday,united,weekend,people,sport,good,day,win,solidarity,national,terrorist,attack,week,event,police,terror,happened,year,country,added,le,ground,victim,la,ve,de,hotel 2 | 2 police,security,terrorist,attack,intelligence,officer,terrorism,threat,government,uk,force,minister,corbyn,paris,labour,britain,secretary,british,leader,london,service,country,people,mi5,mp,defence,armed,party,cameron,home,information,response,call,official,bomb,terror,cut,military,suspect,time,support,kill,yesterday,shadow,european,group,policy,public,stop,year 3 | 3 london,day,people,world,week,year,city,market,time,thursday,attack,terrorists,yesterday,life,terror,g8,terrorist,event,business,olympic,blair,news,security,company,price,economic,school,financial,olympics,group,stock,bomb,attacks,today,home,leader,summit,street,africa,share,war,bank,child,change,fear,don,government,message,death,country 4 | 4 
bus,bomb,london,train,police,people,tube,station,body,family,blast,bomber,work,leeds,yesterday,hospital,friend,street,told,road,home,thought,terrorist,officer,square,underground,king,cross,missing,time,man,bombing,explosion,attack,service,heard,passenger,explosive,suicide,house,woman,night,phone,day,victim,area,left,dead,life,hour 5 | 5 paris,french,france,attack,people,police,terrorist,belgian,bataclan,told,europe,night,refugee,syria,gunman,yesterday,friday,terror,abaaoud,attacks,family,killed,european,friend,life,border,brother,brussels,victim,suicide,abdeslam,concert,belgium,country,man,eu,year,died,isi,state,city,dead,world,shot,shooting,young,restaurant,passport,woman,hollande 6 | 6 isi,syria,isis,syrian,state,paris,attack,war,military,assad,cameron,group,president,force,refugee,western,strike,obama,terrorist,country,russia,russian,emwazi,raqqa,airstrikes,british,international,security,putin,drone,people,france,world,republican,iraq,islamic,west,american,support,europe,britain,fight,killing,bombing,ground,al,leader,french,bomb,action 7 | 7 muslim,britain,london,british,al,iraq,people,islamic,community,terrorism,attack,qaeda,islam,terror,bombing,war,terrorist,blair,terrorists,police,suicide,group,year,bomber,extremist,young,world,government,country,mosque,religious,act,time,west,religion,leader,yesterday,men,mp,home,man,political,street,bomb,life,afghanistan,east,thursday,islamist,minister 8 | -------------------------------------------------------------------------------- /Kmeans/k_select.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuhuanyong/TopicCluster/72f4f8131e27eddb232d1d62d3580d891cccf754/Kmeans/k_select.png -------------------------------------------------------------------------------- /Kmeans/kmeans_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | import os,sys 4 | reload(sys) 5 | 
# Python 2 only: finish the default-encoding hack started by the preceding
# ``reload(sys)``. Python 3 has no ``sys.setdefaultencoding`` (text is already
# Unicode), so the call is skipped there instead of raising AttributeError.
if hasattr(sys, 'setdefaultencoding'):
    sys.setdefaultencoding('utf-8')


def tfidf_vector(corpus_path):
    """Vectorize the input documents as a TF-IDF matrix.

    Each line of ``corpus_path`` is expected to look like
    ``<category><TAB><text>``. Lines that do not split into exactly two
    tab-separated fields are skipped; the category field is not used.

    Args:
        corpus_path: Path of the UTF-8 encoded training corpus.

    Returns:
        A ``(tfidf_train, word_dict)`` tuple: the TF-IDF matrix produced by
        ``TfidfTransformer`` and a dict mapping each column index of that
        matrix to its vocabulary term.

    Raises:
        ValueError: If the corpus contains no usable ``<category><TAB><text>``
            line (scikit-learn would raise the same exception type later,
            with a far less helpful message).
    """
    import io

    # Extract the features from the training corpus.
    corpus_train = []
    with io.open(corpus_path, encoding='utf-8') as corpus_file:
        for line in corpus_file:
            fields = line.strip().split('\t')
            if len(fields) == 2:
                corpus_train.append(fields[1])
    print("build train-corpus done!!")

    if not corpus_train:
        raise ValueError(
            "no '<category>\\t<text>' lines found in %r" % (corpus_path,))

    # Imported lazily so that the heavy dependency is only needed once there
    # is actually something to vectorize.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    count_v1 = CountVectorizer(max_df=0.4, min_df=0.01)
    counts_train = count_v1.fit_transform(corpus_train)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(count_v1, 'get_feature_names_out'):
        feature_names = count_v1.get_feature_names_out()
    else:
        feature_names = count_v1.get_feature_names()
    word_dict = dict(enumerate(feature_names))

    print("the shape of train is " + repr(counts_train.shape))
    tfidf_train = TfidfTransformer().fit_transform(counts_train)
    return tfidf_train, word_dict


def cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords,
                   num_clusters, num_keywords=50, random_state=None):
    """Cluster the documents with K-means and write the results to disk.

    Two text files are produced:

    * ``cluster_docs``: one ``<doc_number>,<label>`` line per document, where
      ``doc_number`` counts the rows of ``tfidf_train`` from 1 and ``label``
      is the value taken from ``KMeans.labels_``.
    * ``cluster_keywords``: one ``<cluster_number><TAB><w1>,<w2>,...`` line
      per centroid, listing the terms with the largest centroid weights.

    NOTE: ``cluster_number`` counts the centroids from 1, while the labels in
    ``cluster_docs`` are written exactly as KMeans reports them. The existing
    numbering of both files is kept unchanged on purpose; keep the offset in
    mind when joining the two files.

    Args:
        tfidf_train: Document-term matrix to cluster.
        word_dict: Mapping from column index of ``tfidf_train`` to term.
        cluster_docs: Output path for the per-document labels.
        cluster_keywords: Output path for the per-cluster keywords.
        num_clusters: Number of clusters to fit.
        num_keywords: How many top terms to write per cluster (default 50,
            the value that used to be hard-coded).
        random_state: Optional seed forwarded to ``KMeans`` so that a run can
            be reproduced; ``None`` keeps the previous unseeded behaviour.
    """
    import io
    from sklearn.cluster import KMeans

    # Fit first: the output files are only touched once clustering succeeded,
    # so a failing run no longer truncates earlier results.
    km = KMeans(n_clusters=num_clusters, random_state=random_state)
    km.fit(tfidf_train)
    # Column indices of every centroid, sorted by descending weight.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    with io.open(cluster_docs, 'w', encoding='utf-8') as f_docs:
        for doc_number, label in enumerate(km.labels_.tolist(), 1):
            f_docs.write(u'{0},{1}\n'.format(doc_number, label))

    with io.open(cluster_keywords, 'w', encoding='utf-8') as f_clusterwords:
        for cluster_number, ranked in enumerate(order_centroids, 1):
            # Take the top ``num_keywords`` words of each cluster.
            words = [word_dict[index] for index in ranked[:num_keywords]]
            keyword_line = u','.join(words)
            print(u'{0} {1}'.format(cluster_number, keyword_line))
            f_clusterwords.write(
                u'{0}\t{1}\n'.format(cluster_number, keyword_line))
            print('*****' * 5)


def best_kmeans(tfidf_matrix, word_dict, max_k=9):
    """Plot an elbow curve to help select the number of clusters.

    For every ``k`` from 1 to ``max_k`` a KMeans model is fitted and the mean
    Euclidean distance between each document and its nearest centroid is
    recorded; the curve is then shown with matplotlib.

    Args:
        tfidf_matrix: Sparse document-term matrix (it must provide
            ``toarray()``).
        word_dict: Unused; kept so existing callers keep working.
        max_k: Largest number of clusters to try (default 9, the previously
            hard-coded upper bound). It is capped at the number of documents.

    Returns:
        The list of mean distances, one entry per tried ``k``.
    """
    import matplotlib.pyplot as plt
    from scipy.spatial.distance import cdist
    from sklearn.cluster import KMeans

    n_samples = tfidf_matrix.shape[0]
    # The dense copy does not depend on k, so build it once, not per fit.
    dense_matrix = tfidf_matrix.toarray()
    # KMeans cannot fit more clusters than there are samples.
    K = list(range(1, min(max_k, n_samples) + 1))
    meandistortions = []
    for k in K:
        print('{0} {1}'.format(k, '****' * 5))
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(tfidf_matrix)
        nearest = cdist(
            dense_matrix, kmeans.cluster_centers_, 'euclidean').min(axis=1)
        meandistortions.append(float(nearest.sum()) / n_samples)
    plt.plot(K, meandistortions, 'bx-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    # The plotted value is a mean distance, not a sum of squares, so the
    # label now says what is actually drawn.
    plt.ylabel('Average distance to nearest centroid')
    plt.title('Elbow for Kmeans clustering')
    plt.show()
    return meandistortions


if __name__ == '__main__':
    corpus_train = "./corpus_train.txt"
    cluster_docs = "./cluster_result_document.txt"
    cluster_keywords = "./cluster_result_keyword.txt"
    num_clusters = 7
    tfidf_train, word_dict = tfidf_vector(corpus_train)
    best_kmeans(tfidf_train, word_dict)
    cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords,
                   num_clusters)

# --------------------------------------------------------------------------------
# End of /Kmeans/kmeans_cluster.py. In the flattened dump the same physical
# line went on with the header and the first row of the next file; both are
# reproduced verbatim below so that replacing the line drops no data.
#
# /Lda/cluster_keywords_lda.txt:
# --------------------------------------------------------------------------------
# 1 | 0 "attack","terrorist","syria","people","london","paris","police","france","security","yesterday","place","isi","group","week","blair","western","eu","british","minister","east","year","terrorism","cameron","country","bomb","french","g8","service","terrorists","muslim","street","intelligence","town","day","al","military","news","show","fan","west","border","city","tuesday","mp","life","long","stop","europe","night","russia","set","told","president"
# 2 | 1
"police","people","british","terrorist","attack","london","paris","isi","officer","france","leader","force","security","bomb","britain","game","french","world","day","islamic","muslim","syrian","country","syria","year","bomber","war","bombing","terrorists","government","action","isis","public","night","terror","friday","play","threat","live","road","terrorism","men","killing","cameron","qaeda","west","raid","suicide","president","iraq","time","assad","victim" 3 | 2 "police","paris","attack","security","french","people","france","time","syria","government","terrorist","terrorism","yesterday","muslim","european","support","year","isi","kill","london","group","place","home","border","extremist","europe","british","cut","britain","force","told","power","bombing","night","uk","shot","community","city","minister","dead","country","islamic","game","gunman","response","refugee","terror","cameron","war","capital","concert","shoot","family" 4 | 3 "london","british","time","leeds","muslim","yesterday","people","life","paris","world","attack","terrorist","play","country","home","leader","bomb","terror","bomber","french","act","family","le","government","police","england","bombing","team","young","city","day","captain","community","pakistan","injured","happened","religion","game","player","britain","syria","year","turn","labour","strike","sense","france","student","thought","security","isi","change","group" 5 | 4 "london","attack","terrorist","world","suicide","british","french","yesterday","terror","paris","muslim","people","al","state","security","iraq","france","victim","home","police","group","britain","told","year","support","military","isi","woman","terrorism","islamic","work","europe","family","claim","bus","bomb","heard","blast","law","blood","time","war","city","game","service","son","bombing","left","man","country","syria","week","border" 6 | 5 
"britain","london","attack","people","terrorist","paris","war","blair","police","time","terrorists","yesterday","security","country","muslim","french","life","world","home","family","city","isi","told","bomb","france","afghanistan","iraq","young","page","man","refugee","team","game","day","government","year","bomber","week","sport","minister","put","islam","school","ve","uk","child","foreign","south","terror","number","official","political","message" 7 | 6 "london","muslim","people","man","paris","home","country","year","terror","terrorist","police","public","attack","war","killed","terrorism","bomb","family","world","work","british","yesterday","officer","britain","prayer","life","friend","bus","death","force","solidarity","act","fear","blood","refugee","service","america","newspaper","blair","station","time","city","page","terrorists","bomber","victim","word","report","money","league","carriage","local","place" 8 | 7 "paris","state","attack","isi","france","people","terror","terrorist","world","police","britain","french","time","year","bomb","death","muslim","london","syria","yesterday","bus","power","told","islamic","war","country","security","president","suicide","fear","british","left","policy","tube","community","attacks","military","response","refugee","station","leader","bombing","force","syrian","heard","victim","friend","night","player","western","train","bomber","friday" 9 | 8 "france","french","paris","people","attack","night","london","bomb","political","bombing","england","country","terrorist","stadium","president","year","world","terror","city","isis","state","police","united","security","national","isi","match","germany","muslim","game","week","group","syrian","friday","football","day","part","community","terrorists","yesterday","team","crime","government","killed","islamic","family","secretary","minister","european","syria","war","meeting","form" 10 | 9 
"french","paris","police","attack","intelligence","france","terrorist","terror","people","london","bomb","al","european","leader","security","british","syria","minister","night","abaaoud","britain","war","world","isi","death","attacks","day","belgium","belgian","service","car","train","suicide","iraq","yesterday","brussels","including","group","terrorists","muslim","killed","scotland","time","bomber","family","city","country","border","terrorism","qaeda","football","tube","victim" 11 | 10 "time","attack","people","security","paris","britain","terrorist","london","isi","day","police","attacks","weapon","cameron","france","french","work","city","group","war","world","terror","syria","face","national","terrorism","bombing","leader","europe","life","night","intelligence","part","uk","hope","state","country","corbyn","bullet","armed","government","officer","terrorists","dead","british","expert","killed","stop","qaeda","middle","al","bomb","muslim" 12 | -------------------------------------------------------------------------------- /Lda/cluster_keywords_lsi.txt: -------------------------------------------------------------------------------- 1 | 0 "attack","paris","france","police","french","terrorist","isi","people","syria","security","london","muslim","britain","country","british","terror","state","bomb","war","world","yesterday","year","time","europe","night","terrorism","islamic","president","bombing","leader","bomber","government","group","terrorists","suicide","told","city","iraq","attacks","isis","intelligence","home","minister","al","cameron","man","life","game","force","day" 2 | 1 
"game","england","football","france","stadium","match","french","player","wembley","team","fan","night","friday","isi","syria","play","isis","war","show","britain","friendly","cameron","fa","military","iraq","united","muslim","germany","assad","british","paris","syrian","western","tuesday","force","bombing","al","hodgson","state","group","islamic","russia","le","leader","west","event","solidarity","government","mp","deschamps" 3 | 2 "syria","isi","london","bus","bomb","bomber","france","isis","president","french","train","assad","leeds","syrian","paris","cameron","state","station","police","europe","russia","tube","military","eu","officer","man","family","home","work","people","border","refugee","british","muslim","russian","yesterday","blast","friend","putin","thursday","street","european","war","country","men","road","young","community","terrorists","force" 4 | 3 "belgian","abaaoud","brussels","belgium","police","brother","blair","game","suicide","world","abdeslam","leader","england","french","iraq","man","syria","told","raid","football","britain","hollande","bataclan","europe","eu","wembley","london","molenbeek","war","match","gunman","european","paris","day","bombing","government","isi","player","support","terrorists","political","labour","arrested","family","border","men","team","mp","france","united" 5 | 4 "security","european","friend","border","bataclan","intelligence","eu","isis","europe","minister","terrorism","shooting","killed","police","told","family","blood","dead","death","belgian","concert","assad","isi","terrorist","government","life","service","national","abaaoud","state","gunman","war","belgium","uk","brussels","officer","killing","young","victim","britain","son","strike","attacks","west","shot","syria","passport","raqqa","restaurant","man" 6 | 5 
"bomb","muslim","leeds","cameron","bomber","train","islamic","bus","station","assad","russia","world","officer","luton","terrorism","life","putin","syria","syrian","police","military","russian","terror","president","isis","islam","bataclan","road","people","terrorists","team","french","night","house","tube","state","explosive","labour","mp","act","iraq","al","aldgate","blast","death","match","paris","believed","airstrikes","corbyn" 7 | 6 "muslim","al","islamic","bomber","british","leeds","suicide","young","britain","market","shooting","dead","bataclan","community","men","iraq","city","people","qaeda","islam","england","emergency","game","concert","wembley","heard","blood","told","european","corbyn","bus","street","football","day","labour","united","gunman","share","system","group","restaurant","extremist","mosque","luton","west","thursday","player","mp","woman","team" 8 | 7 "labour","corbyn","mp","kill","syrian","president","refugee","party","europe","police","shoot","secretary","threat","eu","shadow","emwazi","support","uk","assad","state","action","leader","european","border","putin","russia","armed","bus","tube","world","train","russian","station","shooting","obama","intelligence","market","isis","policy","west","night","bomb","work","london","schengen","jeremy","mi5","vote","country","strike" 9 | 8 "market","refugee","isi","group","muslim","city","border","europe","country","community","killed","year","share","stock","company","leader","eu","al","syria","raqqa","oil","belgian","belgium","price","brother","leeds","strike","islam","man","qaeda","isis","abaaoud","people","thursday","attacks","trading","emwazi","drone","passport","brussels","financial","national","schengen","intelligence","syrian","migrant","religious","terror","business","investor" 10 | 9 
# coding=utf-8
# Topic discovery over a pre-tokenised corpus with gensim: documents are
# converted to bag-of-words and TF-IDF vectors, LSI and LDA models are
# trained on the TF-IDF vectors, and the top keywords of every topic are
# written to text files.
import os
import sys

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str/unicode codec, exactly as
    # the original script did.  Python 3 has no sys.setdefaultencoding, so
    # the hack is skipped there.
    reload(sys)
    sys.setdefaultencoding('utf-8')

from gensim.models import LdaModel, TfidfModel, LsiModel
from gensim import similarities
from gensim import corpora


def create_data(corpus_path):
    """Load the corpus and build its bag-of-words and TF-IDF representations.

    Every line of ``corpus_path`` must consist of exactly two tab-separated
    fields; the second field is the document text, which is split into
    tokens on single spaces.  Reading stops at the first line that does not
    have exactly two fields (for example a blank line), so anything after
    such a line is ignored.

    Args:
        corpus_path: Path of the corpus text file.

    Returns:
        A tuple ``(sentence_dict, dictionary, corpus, corpus_tfidf)``:
        ``sentence_dict`` maps the 0-based document position to its raw
        text, ``dictionary`` is the ``corpora.Dictionary`` built from the
        tokenised documents, ``corpus`` is the list of ``doc2bow`` vectors
        and ``corpus_tfidf`` is that corpus transformed by a ``TfidfModel``
        fitted on it.
    """
    sentences = []
    sentence_dict = {}
    # The context manager closes the file even if parsing raises.
    with open(corpus_path) as corpus_file:
        for line in corpus_file:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                break
            # len(sentences) is the position the document is about to take.
            sentence_dict[len(sentences)] = fields[1]
            sentences.append(fields[1].split(' '))

    # Vocabulary of the whole document collection.
    dictionary = corpora.Dictionary(sentences)
    # Bag-of-words vector for every document.
    corpus = [dictionary.doc2bow(text) for text in sentences]
    # Re-weight the bag-of-words vectors with TF-IDF.
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return sentence_dict, dictionary, corpus, corpus_tfidf


def _extract_keywords(topic_repr):
    """Return the keyword tokens of one formatted topic string.

    ``topic_repr`` is expected to be a sequence of ``weight*word`` terms
    joined by ``' + '``.  For each term the text after the first ``*`` is
    kept with spaces removed; any quotes around the word are preserved.
    Terms without a ``*`` are skipped.
    """
    keywords = []
    for term in topic_repr.split(' + '):
        if '*' not in term:
            continue
        keywords.append(term.split('*', 1)[1].replace(' ', ''))
    return keywords


def _write_topic_keywords(topics, output_path):
    """Write one ``<topic id><TAB><comma-separated keywords>`` line per topic.

    ``topics`` is an iterable of ``(topic_id, topic_repr)`` pairs as
    returned by a model's ``print_topics``.
    """
    with open(output_path, 'w') as keyword_file:
        for topic_id, topic_repr in topics:
            keywords = _extract_keywords(topic_repr)
            keyword_file.write(
                str(topic_id) + '\t' + ','.join(keywords) + '\n')


def lda_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lda, num_topics=11, num_words=53):
    """Train an LDA model on the TF-IDF corpus and export topic keywords.

    Args:
        sentence_dict: Unused; kept so existing callers keep working.
        dictionary: Vocabulary passed to the model as ``id2word``.
        corpus: Unused; kept so existing callers keep working.
        corpus_tfidf: TF-IDF corpus the model is trained on.
        cluster_keyword_lda: Path of the keyword file to (over)write.
        num_topics: Number of topics to train and export.
        num_words: Number of keywords exported per topic.

    Returns:
        The trained ``LdaModel``.
    """
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                   num_topics=num_topics)
    topics = lda.print_topics(num_topics, num_words)
    _write_topic_keywords(topics, cluster_keyword_lda)
    # One separator line per exported topic, as the original script printed.
    for _ in topics:
        print('****' * 5)
    # Apply the model to every TF-IDF document and print the size of the
    # result followed by the result itself.
    for doc in lda[corpus_tfidf]:
        print('%d %s' % (len(doc), doc))
    return lda


def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lsi, num_topics=11, num_words=50):
    """Train an LSI model on the TF-IDF corpus and export topic keywords.

    Args:
        sentence_dict: Unused; kept so existing callers keep working.
        dictionary: Vocabulary passed to the model as ``id2word``.
        corpus: Unused; kept so existing callers keep working.
        corpus_tfidf: TF-IDF corpus the model is trained on.
        cluster_keyword_lsi: Path of the keyword file to (over)write.
        num_topics: Number of topics to train and export.
        num_words: Number of keywords exported per topic.

    Returns:
        The trained ``LsiModel``.
    """
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary,
                   num_topics=num_topics)
    topics = lsi.print_topics(num_topics, num_words)
    _write_topic_keywords(topics, cluster_keyword_lsi)
    # Echo the id of every exported topic, as the original script printed.
    for topic in topics:
        print(topic[0])
    return lsi


if __name__ == "__main__":
    corpus_path = "./corpus_train.txt"
    cluster_keyword_lda = './cluster_keywords_lda.txt'
    cluster_keyword_lsi = './cluster_keywords_lsi.txt'
    sentence_dict, dictionary, corpus, corpus_tfidf = create_data(corpus_path)
    lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lsi)
    lda_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lda)
1、我的github项目介绍:https://liuhuanyong.github.io 8 | 2、我的csdn博客:https://blog.csdn.net/lhy2014 9 | 3、about me:刘焕勇,中国科学院软件研究所,lhy_in_blcu@126.com 10 | --------------------------------------------------------------------------------