├── README ├── data ├── stopwordlist.txt └── test.csv └── src ├── alphabet.py ├── corpus.py ├── experiment.py ├── interactive_plot.py ├── lda.py ├── postprocess.py ├── preprocess.py └── utils.py /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hannawallach/python-lda/1bb1b89fc8e7969e0f4811ffd6b1d873f4e4a41e/README -------------------------------------------------------------------------------- /data/stopwordlist.txt: -------------------------------------------------------------------------------- 1 | * 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | a 13 | a 14 | aacute 15 | abbrev 16 | able 17 | about 18 | about 19 | above 20 | above 21 | ac 22 | according 23 | accordingly 24 | accordingly 25 | acirc 26 | acronym 27 | across 28 | across 29 | actually 30 | acute 31 | address 32 | aelig 33 | after 34 | after 35 | afterwards 36 | afterwards 37 | again 38 | again 39 | against 40 | against 41 | agrave 42 | al. 43 | al 44 | alefsym 45 | all 46 | all 47 | allow 48 | allows 49 | allows 50 | almost 51 | almost 52 | alone 53 | alone 54 | along 55 | along 56 | alpha 57 | already 58 | already 59 | also 60 | also 61 | alt 62 | although 63 | although 64 | always 65 | always 66 | am 67 | am 68 | among 69 | among 70 | amongst 71 | amongst 72 | amp 73 | an 74 | an 75 | and 76 | and 77 | and 78 | ang 79 | another 80 | another 81 | any 82 | any 83 | anybody 84 | anybody 85 | anyhow 86 | anyhow 87 | anyone 88 | anyone 89 | anything 90 | anything 91 | anyway 92 | anyways 93 | anywhere 94 | anywhere 95 | apart 96 | apart 97 | appear 98 | appear 99 | applet 100 | appreciate 101 | appropriate 102 | appropriate 103 | are 104 | are 105 | area 106 | aring 107 | around 108 | around 109 | as 110 | as 111 | aside 112 | aside 113 | ask 114 | asking 115 | associated 116 | associated 117 | asymp 118 | at 119 | at 120 | atilde 121 | au 122 | auml 123 | author 124 | available 125 | available 126 | avi 127 | away 128 | away 129 | awfully 130 | awfully 131 | b 132 | b 133 | back 134 | banner 135 | base 136 | basefont 137 | bdquo 138 | be 139 | be 140 | became 141 | became 142 | because 143 | because 144 | become 145 | become 146 | becomes 147 | becomes 148 | becoming 149 | becoming 150 | been 151 | been 152 | before 153 | before 154 | beforehand 155 | beforehand 156 | behind 157 | behind 158 | being 159 | being 160 | believe 161 | below 162 | below 163 | beside 164 | beside 165 | besides 166 | besides 167 | best 168 | best 169 | beta 170 | better 171 | better 172 | between 173 | between 174 | beyond 175 | beyond 176 | bgsound 177 | big 178 | blink 179 | blockquote 180 | body 181 | both 182 | both 183 | bq 184 | br 185 | brief 186 | brief 187 | brvbar 188 | bull 189 | but 190 | but 191 | by 192 | by 193 | c 194 | call 195 | came 196 | came 197 | can 198 | can 199 | cannot 200 | cannot 201 | cant 202 | cant 203 | cap 204 | caption 205 | cause 206 | cause 207 | causes 208 | causes 209 | ccedil 210 | cedil 211 | cent 212 | center 213 | certain 214 | certain 215 | certainly 216 | cfm 217 | changes 218 | changes 219 | chi 220 | circ 221 | cite 222 | clearly 223 | clubs 224 | co 225 | co 226 | code 227 | col 228 | colgroup 229 | com 230 | com 231 | come 232 | come 233 | comes 234 | concerning 235 | cong 236 | consequently 237 | consequently 238 | consider 239 | considering 240 | contain 241 | contain 242 | containing 243 | containing 244 | contains 245 | contains 246 | copy 247 | corresponding 248 | corresponding 249 | could 250 | could 
251 | course 252 | crarr 253 | credit 254 | cup 255 | curren 256 | currently 257 | currently 258 | d 259 | dagger 260 | darr 261 | day 262 | dd 263 | definitely 264 | deg 265 | del 266 | delta 267 | described 268 | described 269 | despite 270 | dfn 271 | diams 272 | did 273 | did 274 | different 275 | different 276 | dir 277 | div 278 | divide 279 | dl 280 | do 281 | do 282 | doc 283 | does 284 | does 285 | doing 286 | doing 287 | don 288 | done 289 | done 290 | don't 291 | down 292 | down 293 | downwards 294 | downwards 295 | dt 296 | during 297 | during 298 | e 299 | each 300 | each 301 | eacute 302 | ecirc 303 | edu 304 | edu 305 | eg 306 | e.g 307 | eg 308 | egrave 309 | eight 310 | eight 311 | either 312 | either 313 | else 314 | else 315 | elsewhere 316 | elsewhere 317 | em 318 | embed 319 | empty 320 | emsp 321 | enough 322 | enough 323 | ensp 324 | entirely 325 | epsilon 326 | eq 327 | equiv 328 | especially 329 | et 330 | et 331 | eta 332 | etc 333 | etc 334 | eth 335 | euml 336 | euro 337 | even 338 | even 339 | ever 340 | ever 341 | every 342 | every 343 | everybody 344 | everybody 345 | everyone 346 | everyone 347 | everything 348 | everything 349 | everywhere 350 | everywhere 351 | ex 352 | ex 353 | exactly 354 | example 355 | example 356 | except 357 | except 358 | exist 359 | f 360 | far 361 | far 362 | few 363 | few 364 | fifth 365 | fifth 366 | fig 367 | first 368 | first 369 | five 370 | five 371 | fn 372 | fnof 373 | followed 374 | followed 375 | following 376 | following 377 | follows 378 | font 379 | for 380 | for 381 | forall 382 | form 383 | former 384 | former 385 | formerly 386 | formerly 387 | forth 388 | forth 389 | four 390 | four 391 | frac12 392 | frac14 393 | frac34 394 | frame 395 | frameset 396 | frasl 397 | from 398 | from 399 | ftp 400 | further 401 | further 402 | furthermore 403 | furthermore 404 | g 405 | gamma 406 | ge 407 | get 408 | get 409 | gets 410 | gets 411 | getting 412 | gif 413 | given 414 | given 415 | gives 416 | gives 417 | go 418 | go 419 | goes 420 | going 421 | gone 422 | gone 423 | good 424 | got 425 | got 426 | gotten 427 | great 428 | greetings 429 | gt 430 | gz 431 | h 432 | h1 433 | h2 434 | h3 435 | h4 436 | h5 437 | h6 438 | had 439 | had 440 | happens 441 | hardly 442 | hardly 443 | harr 444 | has 445 | has 446 | have 447 | have 448 | having 449 | having 450 | he 451 | he 452 | head 453 | hear 454 | hearts 455 | hellip 456 | hello 457 | help 458 | hence 459 | hence 460 | her 461 | her 462 | here 463 | here 464 | hereafter 465 | hereafter 466 | hereby 467 | hereby 468 | herein 469 | herein 470 | hereupon 471 | hereupon 472 | hers 473 | hers 474 | herself 475 | herself 476 | hi 477 | him 478 | him 479 | himself 480 | himself 481 | his 482 | his 483 | hither 484 | hither 485 | hopefully 486 | how 487 | how 488 | howbeit 489 | howbeit 490 | however 491 | however 492 | hr 493 | href 494 | htm 495 | html 496 | http 497 | i 498 | i 499 | iacute 500 | icirc 501 | ie 502 | i.e 503 | ie 504 | iexcl 505 | if 506 | if 507 | iframe 508 | ignored 509 | ignored 510 | igrave 511 | i'll 512 | image 513 | img 514 | immediate 515 | immediate 516 | in 517 | in 518 | inasmuch 519 | inasmuch 520 | inc 521 | inc 522 | indeed 523 | indeed 524 | indicate 525 | indicate 526 | indicated 527 | indicated 528 | indicates 529 | indicates 530 | infin 531 | inner 532 | inner 533 | input 534 | ins 535 | insofar 536 | insofar 537 | instead 538 | instead 539 | int 540 | into 541 | into 542 | inward 543 | inward 544 | iota 545 | iquest 546 | is 547 | is 548 | isin 
549 | isindex 550 | it 551 | it 552 | its 553 | its 554 | itself 555 | itself 556 | iuml 557 | i've 558 | j 559 | jpeg 560 | jpg 561 | just 562 | just 563 | k 564 | kappa 565 | kbd 566 | keep 567 | keep 568 | keeps 569 | kept 570 | kept 571 | know 572 | know 573 | known 574 | knows 575 | l 576 | lambda 577 | lang 578 | lang 579 | laquo 580 | larr 581 | last 582 | last 583 | lately 584 | later 585 | latter 586 | latter 587 | latterly 588 | latterly 589 | lceil 590 | ldquo 591 | le 592 | least 593 | least 594 | less 595 | less 596 | lest 597 | lest 598 | let 599 | let 600 | lfloor 601 | lh 602 | li 603 | life 604 | like 605 | like 606 | liked 607 | likely 608 | link 609 | listing 610 | little 611 | little 612 | ll 613 | long 614 | look 615 | looking 616 | looks 617 | lowast 618 | loz 619 | lrm 620 | lsaquo 621 | lsquo 622 | lt 623 | ltd 624 | ltd 625 | m 626 | macr 627 | made 628 | mailto 629 | mainly 630 | make 631 | man 632 | many 633 | many 634 | map 635 | marquee 636 | math 637 | may 638 | may 639 | maybe 640 | mdash 641 | me 642 | me 643 | mean 644 | meanwhile 645 | meanwhile 646 | men 647 | menu 648 | merely 649 | meta 650 | micro 651 | middot 652 | might 653 | might 654 | minus 655 | more 656 | more 657 | moreover 658 | moreover 659 | most 660 | most 661 | mostly 662 | mostly 663 | mpeg 664 | mpg 665 | mr 666 | mu 667 | much 668 | much 669 | multicol 670 | must 671 | must 672 | my 673 | my 674 | myself 675 | myself 676 | n 677 | nabla 678 | name 679 | name 680 | namely 681 | namely 682 | nbsp 683 | nd 684 | ndash 685 | ne 686 | near 687 | near 688 | nearly 689 | necessary 690 | necessary 691 | need 692 | needs 693 | neither 694 | neither 695 | never 696 | never 697 | nevertheless 698 | nevertheless 699 | new 700 | new 701 | next 702 | next 703 | ni 704 | nine 705 | nine 706 | no 707 | no 708 | nobody 709 | nobody 710 | nobr 711 | noframes 712 | non 713 | none 714 | none 715 | noone 716 | noone 717 | nor 718 | nor 719 | normally 720 | normally 721 | not 722 | not 723 | not 724 | note 725 | nothing 726 | nothing 727 | notin 728 | novel 729 | novel 730 | now 731 | now 732 | nowhere 733 | nowhere 734 | nsub 735 | ntilde 736 | nu 737 | o 738 | oacute 739 | obviously 740 | ocirc 741 | oelig 742 | of 743 | of 744 | off 745 | off 746 | often 747 | often 748 | ograve 749 | oh 750 | oh 751 | ok 752 | okay 753 | ol 754 | old 755 | old 756 | oline 757 | omega 758 | omicron 759 | on 760 | on 761 | once 762 | once 763 | one 764 | one 765 | ones 766 | ones 767 | only 768 | only 769 | onto 770 | onto 771 | oplus 772 | or 773 | or 774 | or 775 | ordf 776 | ordm 777 | oslash 778 | other 779 | other 780 | others 781 | others 782 | otherwise 783 | otherwise 784 | otilde 785 | otimes 786 | ought 787 | ought 788 | ouml 789 | our 790 | our 791 | ours 792 | ours 793 | ourselves 794 | ourselves 795 | out 796 | out 797 | outside 798 | outside 799 | over 800 | over 801 | overall 802 | overall 803 | overlay 804 | own 805 | own 806 | p 807 | p 808 | para 809 | param 810 | part 811 | particular 812 | particular 813 | particularly 814 | particularly 815 | pdf 816 | people 817 | per 818 | per 819 | perhaps 820 | perhaps 821 | permil 822 | perp 823 | person 824 | phi 825 | php 826 | pi 827 | piv 828 | placed 829 | placed 830 | plaintext 831 | please 832 | please 833 | plus 834 | plus 835 | plusmn 836 | possible 837 | possible 838 | pound 839 | ppt 840 | pre 841 | presumably 842 | prime 843 | probably 844 | probably 845 | prod 846 | prop 847 | provides 848 | provides 849 | ps 850 | psi 851 | q 852 | q 853 | que 854 | 
que 855 | quite 856 | quite 857 | quot 858 | qv 859 | r 860 | radic 861 | rang 862 | range 863 | raquo 864 | rarr 865 | rather 866 | rather 867 | rceil 868 | rd 869 | rdquo 870 | re 871 | real 872 | really 873 | really 874 | reasonably 875 | reg 876 | regarding 877 | regardless 878 | regards 879 | relatively 880 | relatively 881 | respectively 882 | respectively 883 | rfloor 884 | rho 885 | right 886 | right 887 | rlm 888 | rsaquo 889 | rsquo 890 | s 891 | said 892 | said 893 | same 894 | same 895 | samp 896 | saw 897 | say 898 | saying 899 | says 900 | sbquo 901 | scaron 902 | script 903 | sdot 904 | second 905 | second 906 | secondly 907 | secondly 908 | sect 909 | see 910 | see 911 | seeing 912 | seem 913 | seem 914 | seemed 915 | seemed 916 | seeming 917 | seeming 918 | seems 919 | seems 920 | seen 921 | select 922 | self 923 | self 924 | selves 925 | selves 926 | sensible 927 | sensible 928 | sent 929 | sent 930 | serious 931 | serious 932 | seriously 933 | seven 934 | seven 935 | several 936 | several 937 | shall 938 | shall 939 | she 940 | she 941 | should 942 | should 943 | shy 944 | sigma 945 | sigmaf 946 | sim 947 | since 948 | since 949 | six 950 | six 951 | small 952 | so 953 | so 954 | some 955 | some 956 | somebody 957 | somebody 958 | somehow 959 | somehow 960 | someone 961 | someone 962 | something 963 | something 964 | sometime 965 | sometime 966 | sometimes 967 | sometimes 968 | somewhat 969 | somewhat 970 | somewhere 971 | somewhere 972 | soon 973 | sorry 974 | spacer 975 | spades 976 | specified 977 | specified 978 | specify 979 | specify 980 | specifying 981 | specifying 982 | spot 983 | state 984 | still 985 | still 986 | strike 987 | strong 988 | sub 989 | sub 990 | sub 991 | sub 992 | sube 993 | such 994 | such 995 | sum 996 | sup 997 | sup 998 | sup 999 | sup 1000 | sup1 1001 | sup2 1002 | sup3 1003 | supe 1004 | sure 1005 | szlig 1006 | t 1007 | tab 1008 | table 1009 | take 1010 | take 1011 | taken 1012 | taken 1013 | talk 1014 | tar 1015 | tau 1016 | tbody 1017 | td 1018 | tell 1019 | tends 1020 | textarea 1021 | textflow 1022 | tfoot 1023 | th 1024 | th 1025 | than 1026 | than 1027 | thank 1028 | thanks 1029 | thanx 1030 | that 1031 | that 1032 | thats 1033 | the 1034 | the 1035 | thead 1036 | their 1037 | their 1038 | theirs 1039 | theirs 1040 | them 1041 | them 1042 | themselves 1043 | themselves 1044 | then 1045 | then 1046 | thence 1047 | thence 1048 | there 1049 | there 1050 | there4 1051 | thereafter 1052 | thereafter 1053 | thereby 1054 | thereby 1055 | therefore 1056 | therefore 1057 | therein 1058 | therein 1059 | theres 1060 | thereupon 1061 | thereupon 1062 | these 1063 | these 1064 | theta 1065 | thetasym 1066 | they 1067 | they 1068 | think 1069 | thinsp 1070 | third 1071 | third 1072 | this 1073 | this 1074 | thorn 1075 | thorough 1076 | thorough 1077 | thoroughly 1078 | thoroughly 1079 | those 1080 | those 1081 | though 1082 | though 1083 | three 1084 | three 1085 | through 1086 | through 1087 | throughout 1088 | throughout 1089 | thru 1090 | thru 1091 | thus 1092 | thus 1093 | tilde 1094 | time 1095 | times 1096 | title 1097 | to 1098 | to 1099 | together 1100 | together 1101 | too 1102 | too 1103 | took 1104 | toward 1105 | toward 1106 | towards 1107 | towards 1108 | tr 1109 | trade 1110 | tried 1111 | tries 1112 | truly 1113 | try 1114 | trying 1115 | tt 1116 | twice 1117 | twice 1118 | two 1119 | two 1120 | u 1121 | u 1122 | uacute 1123 | uarr 1124 | ucirc 1125 | ugrave 1126 | ul 1127 | uml 1128 | un 1129 | under 1130 | under 1131 | 
unfortunately 1132 | unless 1133 | unless 1134 | unlikely 1135 | until 1136 | until 1137 | unto 1138 | unto 1139 | up 1140 | up 1141 | upon 1142 | upon 1143 | upsih 1144 | upsilon 1145 | url 1146 | us 1147 | us 1148 | use 1149 | use 1150 | used 1151 | used 1152 | useful 1153 | useful 1154 | uses 1155 | uses 1156 | using 1157 | using 1158 | usually 1159 | usually 1160 | uucp 1161 | uuml 1162 | v 1163 | value 1164 | value 1165 | var 1166 | various 1167 | various 1168 | ve 1169 | very 1170 | very 1171 | via 1172 | via 1173 | viz 1174 | viz 1175 | vs 1176 | vs 1177 | w 1178 | want 1179 | wants 1180 | was 1181 | was 1182 | way 1183 | way 1184 | wbr 1185 | we 1186 | we 1187 | weierp 1188 | welcome 1189 | well 1190 | well 1191 | went 1192 | went 1193 | were 1194 | were 1195 | what 1196 | what 1197 | whatever 1198 | whatever 1199 | when 1200 | when 1201 | whence 1202 | whence 1203 | whenever 1204 | whenever 1205 | where 1206 | where 1207 | whereafter 1208 | whereafter 1209 | whereas 1210 | whereas 1211 | whereby 1212 | whereby 1213 | wherein 1214 | wherein 1215 | whereupon 1216 | whereupon 1217 | wherever 1218 | wherever 1219 | whether 1220 | whether 1221 | which 1222 | which 1223 | while 1224 | while 1225 | whither 1226 | whither 1227 | who 1228 | who 1229 | whoever 1230 | whoever 1231 | whole 1232 | whole 1233 | whom 1234 | whom 1235 | whose 1236 | whose 1237 | why 1238 | why 1239 | will 1240 | will 1241 | willing 1242 | wish 1243 | with 1244 | with 1245 | within 1246 | within 1247 | without 1248 | without 1249 | wonder 1250 | work 1251 | world 1252 | would 1253 | would 1254 | would 1255 | www 1256 | x 1257 | xi 1258 | xmp 1259 | y 1260 | yacute 1261 | year 1262 | years 1263 | yen 1264 | yes 1265 | yet 1266 | yet 1267 | you 1268 | you 1269 | your 1270 | your 1271 | yours 1272 | yours 1273 | yourself 1274 | yourself 1275 | yourselves 1276 | yourselves 1277 | yuml 1278 | z 1279 | zero 1280 | zero 1281 | zeta 1282 | zip 1283 | zwj 1284 | zwnj 1285 | preferably 1286 |
--------------------------------------------------------------------------------
/data/test.csv:
--------------------------------------------------------------------------------
XXX XXX cat dog apple cat
YYY YYY dog dog fox apple frog
--------------------------------------------------------------------------------
/src/alphabet.py:
--------------------------------------------------------------------------------
# NB: this module, like the rest of src/, targets Python 2 (basestring,
# xrange, print statements and cPickle are used throughout)

class Alphabet(object):

    def __init__(self):

        self._mapping = {} # mapping from strings to integers
        self._reverse = {} # reverse mapping from integers to strings

        self._idx = 0
        self._growing = True

    def stop_growth(self):
        self._growing = False

    def lookup(self, i):

        assert isinstance(i, int)
        return self._reverse[i]

    def plaintext(self):

        contents = self._reverse.items()
        contents.sort(key=lambda x: x[0])

        return '\n'.join('%s\t%s' % (i, s) for i, s in contents)

    def __contains__(self, s):

        assert isinstance(s, basestring)
        return s in self._mapping

    def __getitem__(self, s):

        try:
            return self._mapping[s]
        except KeyError:
            if not isinstance(s, basestring):
                raise ValueError('Invalid key (%s): must be a string.' % (s,))
            if not self._growing:
                return None
            i = self._mapping[s] = self._idx
            self._reverse[i] = s
            self._idx += 1
            return i

    add = __getitem__

    def __iter__(self):

        for i in xrange(len(self)):
            yield self._reverse[i]

    def __len__(self):
        return len(self._mapping)
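A minimal usage sketch for Alphabet (not part of the original repository; Python 2, run from the src/ directory):

    from alphabet import Alphabet

    a = Alphabet()
    print a['cat'], a['dog'], a['cat']  # -> 0 1 0 (repeated strings reuse their index)
    print a.lookup(1)                   # -> dog
    a.stop_growth()
    print a['fox']                      # -> None (unseen strings are no longer added)
    print len(a)                        # -> 2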
--------------------------------------------------------------------------------
/src/corpus.py:
--------------------------------------------------------------------------------
from alphabet import *
from numpy import array, ndarray
import cPickle as pickle


class Document(object):

    def __init__(self, corpus, name, tokens):

        assert isinstance(corpus, Corpus)
        assert isinstance(name, basestring)
        assert isinstance(tokens, ndarray)

        self.corpus = corpus
        self.name = name
        self.tokens = tokens

    def __len__(self):
        return len(self.tokens)

    def plaintext(self):
        return ' '.join([self.corpus.alphabet.lookup(x) for x in self.tokens])


class Corpus(object):

    def __init__(self):

        self.documents = []
        self.alphabet = Alphabet()

    def add(self, name, data):

        assert isinstance(name, basestring)
        assert isinstance(data, list)

        tokens = array([self.alphabet[x] for x in data])
        self.documents.append(Document(self, name, tokens))

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        return pickle.load(file(filename, 'rb'))  # binary mode, to match save()

    def save(self, filename):
        pickle.dump(self, file(filename, 'wb'))
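A small sketch of how Corpus is meant to be used (hypothetical document names and tokens; Python 2, run from src/; the pickle path is arbitrary):

    from corpus import Corpus

    corpus = Corpus()
    corpus.add('doc1', ['cat', 'dog', 'apple', 'cat'])
    corpus.add('doc2', ['dog', 'fox'])

    print len(corpus), len(corpus.alphabet)  # -> 2 4
    print len(corpus.documents[0])           # -> 4 (tokens are integer type indices)

    corpus.save('/tmp/example.pkl')
    same_corpus = Corpus.load('/tmp/example.pkl')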
--------------------------------------------------------------------------------
/src/experiment.py:
--------------------------------------------------------------------------------
from argparse import ArgumentParser
from corpus import *
from lda import *


def main():

    # parse command-line arguments

    p = ArgumentParser()

    p.add_argument('input_file', metavar='input-file', help='file containing preprocessed data')
    p.add_argument('output_dir', metavar='output-dir', help='output directory')
    p.add_argument('-T', metavar='num-topics', type=int, default=100, help='number of topics (default: 100)')
    p.add_argument('-S', metavar='num-iterations', type=int, default=1000, help='number of Gibbs sampling iterations (default: 1000)')
    p.add_argument('--optimize', action='store_true', help='optimize Dirichlet hyperparameters')

    args = p.parse_args()

    corpus = Corpus.load(args.input_file)

    lda = LDA(corpus, args.T, args.S, args.optimize, args.output_dir)

    lda.inference()


if __name__ == '__main__':

    # import cProfile
    # cProfile.run('main()', 'lda_profile')

    main()
--------------------------------------------------------------------------------
/src/interactive_plot.py:
--------------------------------------------------------------------------------
from matplotlib import pyplot


class InteractivePlot(object):

    def __init__(self, xlabel, ylabel):

        pyplot.ion()

        self.xlabel = xlabel
        self.ylabel = ylabel

        self.x = []
        self.y = []

    def update_plot(self, xval, yval):

        x = self.x
        y = self.y

        x.append(xval)
        y.append(yval)

        pyplot.clf()

        pyplot.plot(x, y, 'k')

        pyplot.xlabel(self.xlabel)
        pyplot.ylabel(self.ylabel)

        pyplot.draw()


if __name__ == '__main__':

    import time, math

    plt = InteractivePlot('x', 'y')

    for x in xrange(100):

        plt.update_plot(x, x * math.sin(0.4 * x))
        time.sleep(.01)
--------------------------------------------------------------------------------
/src/lda.py:
--------------------------------------------------------------------------------
from corpus import *
from interactive_plot import *
from numpy import argsort, cumsum, log, ones, random, searchsorted, sum, zeros
import os, sys


class LDA(object):

    def log_prob(self):

        # log joint probability of the tokens and their topic assignments,
        # accumulated via the chain rule; this also rebuilds nwt, nt and ntd
        # from the current assignments in self.z

        beta, beta_sum = self.beta, self.beta_sum
        alpha, alpha_sum = self.alpha, self.alpha_sum
        nwt, nt, ntd = self.nwt, self.nt, self.ntd

        lp = 0.0

        nwt.fill(0)
        nt.fill(0)

        ntd.fill(0)

        for d, (doc, zd) in enumerate(zip(self.corpus, self.z)):
            for n, (w, t) in enumerate(zip(doc.tokens, zd)):

                lp += log((nwt[w, t] + beta[w]) / (nt[t] + beta_sum) * (ntd[t, d] + alpha[t]) / (n + alpha_sum))

                nwt[w, t] += 1
                nt[t] += 1
                ntd[t, d] += 1

        return lp

    def print_topics(self, num=5):

        beta = self.beta
        nwt = self.nwt

        alphabet = self.corpus.alphabet

        for t in xrange(self.T):

            sorted_types = map(alphabet.lookup, argsort(nwt[:, t] + beta))
            print 'Topic %s: %s' % (t+1, ' '.join(sorted_types[-num:][::-1]))

    def save_state(self, filename):

        # one row per token: document index, position, type index, type string, topic

        alphabet = self.corpus.alphabet

        f = open(filename, 'wb')

        for d, (doc, zd) in enumerate(zip(self.corpus, self.z)):
            for n, (w, t) in enumerate(zip(doc.tokens, zd)):
                f.write('%s %s %s %s %s\n' % (d, n, w, alphabet.lookup(w), t))

        f.close()

    def __init__(self, corpus, T, S, optimize, dirname):

        # random.seed(1000)

        self.corpus = corpus

        self.D = D = len(corpus)
        self.N = N = sum(len(doc) for doc in corpus)
        self.W = W = len(corpus.alphabet)

        self.T = T
        self.S = S

        self.optimize = optimize  # NB: hyperparameter optimization is not implemented below; the flag is only reported

        self.dirname = dirname

        if not os.path.exists(dirname):
            os.makedirs(dirname)

        assert not os.listdir(dirname), 'Output directory must be empty.'

        print '# documents =', D
        print '# tokens =', N
        print '# unique types =', W
        print '# topics =', T
        print '# iterations =', S
        print 'Optimize hyperparameters =', optimize

        self.beta = 0.01 * ones(W)
        self.beta_sum = 0.01 * W

        self.alpha = 0.1 * ones(T)
        self.alpha_sum = 0.1 * T

        self.nwt = zeros((W, T), dtype=int)
        self.nt = zeros(T, dtype=int)

        self.ntd = zeros((T, D), dtype=int)

        self.z = z = []

        for doc in corpus:
            z.append(zeros(len(doc), dtype=int))

    def inference(self):

        self.sample_topics(init=True)

        lp = self.log_prob()

        plt = InteractivePlot('Iteration', 'Log Probability')
        plt.update_plot(0, lp)

        print '\nIteration %s: %s' % (0, lp)
        self.print_topics()

        for s in xrange(1, self.S+1):

            sys.stdout.write('.')

            if s % 10 == 0:

                lp = self.log_prob()

                plt.update_plot(s, lp)

                print '\nIteration %s: %s' % (s, lp)
                self.print_topics()

                self.save_state('%s/state.txt.%s' % (self.dirname, s))

            self.sample_topics()

    def sample_topics(self, init=False):

        beta, beta_sum = self.beta, self.beta_sum
        alpha, alpha_sum = self.alpha, self.alpha_sum
        nwt, nt, ntd = self.nwt, self.nt, self.ntd

        for d, (doc, zd) in enumerate(zip(self.corpus, self.z)):
            for n, (w, t) in enumerate(zip(doc.tokens, zd)):
                if not init:
                    nwt[w, t] -= 1
                    nt[t] -= 1
                    ntd[t, d] -= 1

                # unnormalized collapsed Gibbs conditional over topics for token w in document d
                dist = (nwt[w, :] + beta[w]) / (nt + beta_sum) * (ntd[:, d] + alpha)

                dist_sum = cumsum(dist)
                r = random.random() * dist_sum[-1]
                t = searchsorted(dist_sum, r)

                nwt[w, t] += 1
                nt[t] += 1
                ntd[t, d] += 1

                zd[n] = t
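For reference, the state files consumed by postprocess.py are written by LDA.save_state every tenth iteration (state.txt.10, state.txt.20, ... inside the chosen output directory). Each row contains five whitespace-separated fields (document index d, token position n, type index w, the type string, and the sampled topic t), so a row might look like 0 3 17 apple 42 (values illustrative). This layout is why aggregate() in postprocess.py below reads document indices from column 0, type indices from column 2, and topics from the last column.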
--------------------------------------------------------------------------------
/src/postprocess.py:
--------------------------------------------------------------------------------
from collections import defaultdict
from glob import glob
from itertools import islice
from numpy import argmin, argsort, finfo, reshape, size, sum, unique, vstack, zeros
from utils import *
from corpus import *


def aggregate(regexp, col=-1, header=0):

    # collect column `col` from every state file matching the glob pattern
    # `regexp`; state-file columns are: document d, position n, type index w,
    # type string, topic t (so the default col=-1 selects the topic)

    files = glob(regexp)

    tmp = defaultdict(list)

    for statefile in files:
        for n, row in enumerate(islice(open(statefile), header, None)):
            tmp[n].append(row.split()[col])

    S = len(files)
    N = len(tmp)

    values = zeros((N, S), dtype=int)

    for n in xrange(N):
        values[n, :] = tmp[n]

    if S == 1:
        values = reshape(values, N)

    return values


def count_unique_values(values):

    tmp = zeros(len(values), dtype=int)

    for n, v in enumerate(values):
        tmp[n] = len(unique(v))

    return tmp


def get_empirical_phi(statefile):

    # empirical topic-word proportions; each column (topic) of phi sums to approximately one

    nwt = get_nwt(statefile)
    phi = nwt / (sum(nwt, 0).astype('float') + finfo('float').eps)

    return phi


def get_empirical_theta(statefile):

    # empirical document-topic proportions; each column (document) of theta sums to approximately one

    ntd = get_ntd(statefile)
    theta = ntd / (sum(ntd, 0).astype('float') + finfo('float').eps)

    return theta


def get_nwt(statefile):

    w_idx = aggregate(statefile, col=2)
    t_idx = aggregate(statefile)

    return get_2d_counts(vstack((w_idx, t_idx)).T)


def get_ntd(statefile):

    t_idx = aggregate(statefile)
    d_idx = aggregate(statefile, col=0)

    return get_2d_counts(vstack((t_idx, d_idx)).T)


def get_nw(statefile):

    w_idx = aggregate(statefile, col=2)

    return get_1d_counts(w_idx)


def get_nt(statefile):

    t_idx = aggregate(statefile)

    return get_1d_counts(t_idx)


def get_1d_counts(data):

    tmp = defaultdict(int)

    for i in data:
        tmp[i] += 1

    I = len(tmp)

    counts = zeros(I, dtype=int)

    for i in xrange(I):
        counts[i] = tmp[i]

    return counts


def get_2d_counts(data):

    tmp = defaultdict(int)

    I = J = 1

    for x in data:

        i, j = x[0], x[1]
        I, J = max(I, i+1), max(J, j+1)

        tmp[(i, j)] += 1

    counts = zeros((I, J), dtype=int)

    for i in xrange(I):
        for j in xrange(J):
            counts[i, j] = tmp[(i, j)]

    return counts


def align_topics(statefile1, statefile2):

    phi1 = get_empirical_phi(statefile1)
    phi2 = get_empirical_phi(statefile2)

    T1, T2 = size(phi1, 1), size(phi2, 1)

    dist = zeros((T1, T2))

    for i in xrange(T1):
        for j in xrange(T2):
            dist[i, j] = hellinger(phi1[:, i], phi2[:, j])

    return dist


def get_corpus(statefile, header=0):

    corpus = Corpus()

    name = None
    tokens = []

    for row in islice(open(statefile), header, None):

        fields = row.split()

        if fields[1] == '0':

            if name and tokens:
                corpus.add(name, tokens)

            name = fields[0]
            tokens = []

        tokens.append(fields[3])

    if name and tokens:
        corpus.add(name, tokens)

    return corpus
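A possible post-processing session (not from the original repository; Python 2, run from src/; the output paths and the iteration number 1000 are assumptions that depend on how experiment.py was invoked):

    from postprocess import get_empirical_phi, get_empirical_theta, align_topics

    # empirical topic-word and document-topic proportions from one finished run
    phi = get_empirical_phi('../output/state.txt.1000')      # W x T; each column sums to ~1
    theta = get_empirical_theta('../output/state.txt.1000')  # T x D; each column sums to ~1

    # Hellinger distances between the topics of two independent runs
    dist = align_topics('../run1/state.txt.1000', '../run2/state.txt.1000')
    print dist.argmin(1)  # for each topic in run 1, the index of the closest topic in run 2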
--------------------------------------------------------------------------------
/src/preprocess.py:
--------------------------------------------------------------------------------
import argparse, csv, re
from corpus import *


def create_stopword_list(f):

    if not f:
        return set()

    if isinstance(f, basestring):
        f = file(f)

    return set(word.strip() for word in f)


def tokenize(data, stopwords=set()):

    tokens = re.findall('[a-z]+', data.lower())

    return [x for x in tokens if x not in stopwords]


def main():

    # parse command-line arguments

    parser = argparse.ArgumentParser()

    parser.add_argument('input_file', metavar='input-file', help='CSV file to be preprocessed')
    parser.add_argument('--remove-stopwords', metavar='stopword-file', help='remove stopwords provided in the specified file')
    parser.add_argument('--output-file', metavar='output-file', help='save preprocessed data to the specified file')

    args = parser.parse_args()

    # create stopword list

    stopwords = create_stopword_list(args.remove_stopwords)

    # preprocess data

    corpus = Corpus()

    # each row of the tab-delimited input file is expected to hold three fields: name, label, text
    for name, label, data in csv.reader(open(args.input_file), delimiter='\t'):
        corpus.add(name, tokenize(data, stopwords))

    print '# documents =', len(corpus)
    print '# tokens =', sum(len(doc) for doc in corpus)
    print '# unique types =', len(corpus.alphabet)

    if args.output_file:
        corpus.save(args.output_file)


if __name__ == '__main__':
    main()
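Putting the two scripts together, a typical run over the toy data might look like the following (illustrative invocations, assuming they are issued from the src/ directory and that the columns of test.csv are tab-separated name, label and text fields, as the csv.reader call above expects; corpus.pkl and output are arbitrary names):

    python preprocess.py ../data/test.csv --remove-stopwords ../data/stopwordlist.txt --output-file corpus.pkl
    python experiment.py corpus.pkl output -T 10 -S 100

preprocess.py prints the corpus statistics and pickles the Corpus; experiment.py loads it and runs 100 Gibbs sampling iterations over 10 topics, printing the log probability and top words every 10 iterations and saving state.txt.* files into output.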
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
from numpy import log2, sqrt, sum


def entropy(p):

    # entropy in bits; assumes p is strictly positive

    return -sum(p * log2(p))


def hellinger(p, q):

    # unnormalized Hellinger distance (no 1/sqrt(2) factor)

    return sqrt(sum((sqrt(p) - sqrt(q))**2))


def kl(p, q):

    # KL divergence in bits; assumes p and q are strictly positive

    return sum(p * log2(p / q))


def jensen_shannon(p, q):

    m = 0.5 * (p + q)

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)


def euclidean(p, q):

    return sqrt(sum((p - q)**2))
--------------------------------------------------------------------------------
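A quick sketch of the distance helpers on two hand-made distributions (the numbers are arbitrary); kl, and therefore jensen_shannon, assumes strictly positive entries:

    from numpy import array
    from utils import entropy, hellinger, kl, jensen_shannon, euclidean

    p = array([0.5, 0.3, 0.2])
    q = array([0.4, 0.4, 0.2])

    print entropy(p)            # -> about 1.49 (bits)
    print hellinger(p, q)       # -> about 0.11
    print kl(p, q), kl(q, p)    # asymmetric
    print jensen_shannon(p, q)  # symmetric and finite for positive p, q
    print euclidean(p, q)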