├── Client.py ├── README.md ├── Server.py ├── extra_dict ├── dict.txt.big ├── idf.txt └── stop_words.utf8 ├── gen-py ├── __init__.py └── rpc │ ├── DocServlet-remote │ ├── DocServlet.py │ ├── DocServlet.pyc │ ├── __init__.py │ ├── __init__.pyc │ ├── constants.py │ ├── ttypes.py │ └── ttypes.pyc ├── mylog.py ├── rpc.thrift ├── start-infer_server.sh ├── stop-infer_server.sh └── weibo_cluster.py /Client.py: -------------------------------------------------------------------------------- 1 | import sys, glob 2 | sys.path.append('gen-py') 3 | #sys.path.insert(0, glob.glob('../../lib/py/build/lib.*')[0]) 4 | 5 | from rpc import DocServlet 6 | from rpc.ttypes import * 7 | 8 | from thrift import Thrift 9 | from thrift.transport import TSocket 10 | from thrift.transport import TTransport 11 | from thrift.protocol import TBinaryProtocol 12 | from thrift.protocol import TCompactProtocol 13 | 14 | try: 15 | # Make socket 16 | transport = TSocket.TSocket('localhost', 9099) 17 | # Buffering is critical. Raw sockets are very slow 18 | transport = TTransport.TBufferedTransport(transport) 19 | # Wrap in a protocol 20 | #protocol = TBinaryProtocol.TBinaryProtocol(transport) 21 | protocol = TCompactProtocol.TCompactProtocol(transport) 22 | # Create a client to use the protocol encoder 23 | client = DocServlet.Client(protocol) 24 | # Connect! 25 | transport.open() 26 | 27 | doc_list = [] 28 | fp = open('./result/infer.txt','r') 29 | for line in fp: 30 | doc = Doc_Info('1',line.strip() ) 31 | doc_list.append(doc); 32 | fp.close() 33 | topics = client.Infer(doc_list,15,10,15) 34 | fw = open('result.txt','w') 35 | for topic in topics: 36 | #print tag 37 | fw.write(str(topic.topicid)+':\t');fw.write(topic.topicwords+'\n') 38 | for one in topic.doclist: 39 | fw.write(one.docid+'\t'+str(one.consinesim)+'\t'+one.text+'\n' ) 40 | fw.write('===================================\n') 41 | fw.close() 42 | 43 | transport.close() 44 | except Thrift.TException, tx: 45 | print '%s' % (tx.message) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tag doc using topN words with lda 2 | --- 3 | 利用LDA的历史训练,为新文章自动打Tag的thrift服务。 4 | 5 | 该方法通过infer得到文档的theta分布,找到该文档概率最大主题编号,然后根据phi分布获取主题编号的topN词,作为文章标签输出。 6 | 7 | collapsed Gibbs LDA reference : [my blog](http://nanjunxiao.github.io/2015/08/07/Topic-Model-LDA%E7%90%86%E8%AE%BA%E7%AF%87/ ) 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /Server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import sys, glob 4 | import os 5 | import math 6 | import time 7 | import json 8 | import glob 9 | import jieba 10 | 11 | sys.path.append('gen-py') 12 | #sys.path.insert(0, glob.glob('../../lib/py/build/lib.*')[0]) 13 | from rpc import DocServlet 14 | from rpc.ttypes import * 15 | 16 | from thrift.transport import TSocket 17 | from thrift.transport import TTransport 18 | #from thrift.protocol import TBinaryProtocol 19 | from thrift.protocol import TCompactProtocol 20 | from thrift.server import TServer 21 | 22 | #================================= 23 | exedirname=os.path.dirname(os.path.abspath(sys.argv[0]) ) 24 | stopwordsfilename = '%s/extra_dict/stop_words.utf8' %exedirname 25 | inference_data_file = '%s/result/toinfer.doc'%exedirname 26 | inference_result_file = '%s/result/inference_result'%exedirname 27 | model_file = '%s/result/model_file'%exedirname 28 | #================================= 29 | 30 | def json2dict(text): 31 | try: 32 | djson = json.loads(text,encoding='utf-8') 33 | except Exception,e: 34 | print >> sys.stderr, text , e 35 | return None 36 | return djson 37 | def loadstopwords(filename): 38 | fp = open(filename, 'rb') 39 | stopwords = set() 40 | for line in fp: 41 | #jieba cut is unicode 42 | stopwords.add(line.strip('\n').decode('utf-8')) 43 | fp.close() 44 | return stopwords 45 | def loadtrainresult(showfilename): 46 | fp = open(showfilename,'r') 47 | topic_list = [] 48 | for line in fp: 49 | topic_list.append( json2dict(line.strip() ) ) 50 | fp.close() 51 | return topic_list 52 | """文档属于那个类""" 53 | def whichcluster(listweight): 54 | return max(enumerate(listweight), key=lambda x: x[1])[0] 55 | 56 | class DocServletHandler: 57 | def __init__(self): 58 | self.stopwords = loadstopwords(stopwordsfilename) 59 | filebeg = time.strftime('%Y%m%d',time.localtime(time.time() - 86400) ); fileend = '.result'; showfilename = glob.glob('%s/result/%s*%s'%(exedirname,filebeg,fileend) )[0] 60 | #print 'show: ',showfilename 61 | self.topic_list = loadtrainresult(showfilename) 62 | self.topicnum = len(self.topic_list);self.alpha = 50.0/self.topicnum 63 | pass 64 | def Ping(self, sth): 65 | print 'receive: %s\n' % (sth) 66 | return 'Pang...' 67 | 68 | def Infer(self, docs, burn_in_iterations=15, accumulating_iterations=10, docnumoftopic=15): 69 | topics = []; toinfer_index= [] 70 | try: 71 | fp = open(inference_data_file,'w') 72 | inferdocnum = 0 73 | for doc in docs: 74 | text_seg = jieba.cut(doc.text.strip() ) 75 | text_result = list(set(text_seg) - self.stopwords) 76 | content = ' 1 '.join(text_result) 77 | if content != '': 78 | toinfer_index.append(inferdocnum) 79 | fp.write(content.encode('utf-8') +'\n') 80 | inferdocnum += 1 81 | fp.close() 82 | print '----------------------------------------------------->' 83 | print 'N:%d\t%s\n' %(len(docs),time.asctime()) 84 | print ('time ./infer --alpha %f --beta 0.1 --inference_data_file %s --inference_result_file %s --model_file %s --burn_in_iterations %d --accumulating_iterations %d '\ 85 | %(self.alpha,inference_data_file,inference_result_file,model_file, burn_in_iterations,accumulating_iterations) ) 86 | os.system('time ./infer --alpha %f --beta 0.1 --inference_data_file %s --inference_result_file %s --model_file %s --burn_in_iterations %d --accumulating_iterations %d '\ 87 | %(self.alpha,inference_data_file,inference_result_file,model_file, burn_in_iterations,accumulating_iterations) ) 88 | fp = open(inference_result_file,'r') 89 | infer_result_which = [] 90 | for line in fp: 91 | line_list = line.strip().split() 92 | infer_result_which.append(whichcluster(line_list) ) 93 | fp.close() 94 | counter=0 95 | for i,index in enumerate(toinfer_index): 96 | for j in range(index-counter ): 97 | topics.append(Topic_Info() ) 98 | onetopic = Topic_Info();which=infer_result_which[i];onetopic.topicid=which;onetopic.topicwords=self.topic_list[which]['topicwords'].encode('utf-8'); 99 | docnumoftopic = min(docnumoftopic,len(self.topic_list[which]['doclist'] ) ) 100 | for jj in range(docnumoftopic): 101 | onetopic.doclist.append(Doc_Info(self.topic_list[which]['doclist'][jj]['docid'], self.topic_list[which]['doclist'][jj]['text'].encode('utf-8'),self.topic_list[which]['doclist'][jj]['consinesim'] ) ) 102 | topics.append(onetopic);counter=index+1 103 | if counter> sys.stderr, e 111 | 112 | return topics 113 | 114 | def Sender2(self,docs): 115 | for doc in docs: 116 | print doc.text 117 | 118 | handler = DocServletHandler() 119 | processor = DocServlet.Processor(handler) 120 | transport = TSocket.TServerSocket("127.0.0.1",port=9099) 121 | tfactory = TTransport.TBufferedTransportFactory() 122 | #pfactory = TBinaryProtocol.TBinaryProtocolFactory() 123 | pfactory = TCompactProtocol.TCompactProtocolFactory() 124 | server = TServer.TSimpleServer(processor, transport, tfactory, pfactory) 125 | 126 | # You could do one of these for a multithreaded server 127 | #server = TServer.TThreadedServer(processor, transport, tfactory, pfactory) 128 | #server = TServer.TThreadPoolServer(processor, transport, tfactory, pfactory) 129 | 130 | print 'Starting the server...' 131 | server.serve() 132 | print 'done.' 133 | -------------------------------------------------------------------------------- /extra_dict/stop_words.utf8: -------------------------------------------------------------------------------- 1 | " 2 | . 3 | 。 4 | , 5 | 、 6 | ! 7 | ? 8 | : 9 | ; 10 | ` 11 | ﹑ 12 | • 13 | " 14 | ^ 15 | … 16 | ‘ 17 | ’ 18 | “ 19 | ” 20 | 〝 21 | 〞 22 | ~ 23 | \ 24 | ∕ 25 | | 26 | ¦ 27 | ‖ 28 | —  29 | ( 30 | ) 31 | 〈 32 | 〉 33 | ﹞ 34 | ﹝ 35 | 「 36 | 」 37 | ‹ 38 | › 39 | 〖 40 | 〗 41 | 】 42 | 【 43 | » 44 | « 45 | 』 46 | 『 47 | 〕 48 | 〔 49 | 》 50 | 《 51 | } 52 | { 53 | ] 54 | [ 55 | ﹐ 56 | ¸ 57 | ﹕ 58 | ︰ 59 | ﹔ 60 | ; 61 | ! 62 | ¡ 63 | ? 64 | ¿ 65 | ﹖ 66 | ﹌ 67 | ﹏ 68 | ﹋ 69 | ' 70 | ´ 71 | ˊ 72 | ˋ 73 | - 74 | ― 75 | ﹫ 76 | @ 77 | ︳ 78 | ︴ 79 | _ 80 | ¯ 81 | _ 82 |  ̄ 83 | ﹢ 84 | + 85 | ﹦ 86 | = 87 | ﹤ 88 | ‐ 89 | < 90 | ­ 91 | ˜ 92 | ~ 93 | ﹟ 94 | # 95 | ﹩ 96 | $ 97 | ﹠ 98 | & 99 | ﹪ 100 | % 101 | ﹡ 102 | * 103 | ﹨ 104 | \ 105 | ﹍ 106 | ﹉ 107 | ﹎ 108 | ﹊ 109 | ˇ 110 | ︵ 111 | ︶ 112 | ︷ 113 | ︸ 114 | ︹ 115 | ︿ 116 | ﹀ 117 | ︺ 118 | ︽ 119 | ︾ 120 | _ 121 | ˉ 122 | ﹁ 123 | ﹂ 124 | ﹃ 125 | ﹄ 126 | ︻ 127 | ︼ 128 | 的 129 | 了 130 | the 131 | a 132 | an 133 | that 134 | those 135 | this 136 | that 137 | $ 138 | 0 139 | 1 140 | 2 141 | 3 142 | 4 143 | 5 144 | 6 145 | 7 146 | 8 147 | 9 148 | ? 149 | _ 150 | “ 151 | ” 152 | 、 153 | 。 154 | 《 155 | 》 156 | 一 157 | 一些 158 | 一何 159 | 一切 160 | 一则 161 | 一方面 162 | 一旦 163 | 一来 164 | 一样 165 | 一般 166 | 一转眼 167 | 万一 168 | 上 169 | 上下 170 | 下 171 | 不 172 | 不仅 173 | 不但 174 | 不光 175 | 不单 176 | 不只 177 | 不外乎 178 | 不如 179 | 不妨 180 | 不尽 181 | 不尽然 182 | 不得 183 | 不怕 184 | 不惟 185 | 不成 186 | 不拘 187 | 不料 188 | 不是 189 | 不比 190 | 不然 191 | 不特 192 | 不独 193 | 不管 194 | 不至于 195 | 不若 196 | 不论 197 | 不过 198 | 不问 199 | 与 200 | 与其 201 | 与其说 202 | 与否 203 | 与此同时 204 | 且 205 | 且不说 206 | 且说 207 | 两者 208 | 个 209 | 个别 210 | 临 211 | 为 212 | 为了 213 | 为什么 214 | 为何 215 | 为止 216 | 为此 217 | 为着 218 | 乃 219 | 乃至 220 | 乃至于 221 | 么 222 | 之 223 | 之一 224 | 之所以 225 | 之类 226 | 乌乎 227 | 乎 228 | 乘 229 | 也 230 | 也好 231 | 也罢 232 | 了 233 | 二来 234 | 于 235 | 于是 236 | 于是乎 237 | 云云 238 | 云尔 239 | 些 240 | 亦 241 | 人 242 | 人们 243 | 人家 244 | 什么 245 | 什么样 246 | 今 247 | 介于 248 | 仍 249 | 仍旧 250 | 从 251 | 从此 252 | 从而 253 | 他 254 | 他人 255 | 他们 256 | 以 257 | 以上 258 | 以为 259 | 以便 260 | 以免 261 | 以及 262 | 以故 263 | 以期 264 | 以来 265 | 以至 266 | 以至于 267 | 以致 268 | 们 269 | 任 270 | 任何 271 | 任凭 272 | 似的 273 | 但 274 | 但凡 275 | 但是 276 | 何 277 | 何以 278 | 何况 279 | 何处 280 | 何时 281 | 余外 282 | 作为 283 | 你 284 | 你们 285 | 使 286 | 使得 287 | 例如 288 | 依 289 | 依据 290 | 依照 291 | 便于 292 | 俺 293 | 俺们 294 | 倘 295 | 倘使 296 | 倘或 297 | 倘然 298 | 倘若 299 | 借 300 | 假使 301 | 假如 302 | 假若 303 | 傥然 304 | 像 305 | 儿 306 | 先不先 307 | 光是 308 | 全体 309 | 全部 310 | 兮 311 | 关于 312 | 其 313 | 其一 314 | 其中 315 | 其二 316 | 其他 317 | 其余 318 | 其它 319 | 其次 320 | 具体地说 321 | 具体说来 322 | 兼之 323 | 内 324 | 再 325 | 再其次 326 | 再则 327 | 再有 328 | 再者 329 | 再者说 330 | 再说 331 | 冒 332 | 冲 333 | 况且 334 | 几 335 | 几时 336 | 凡 337 | 凡是 338 | 凭 339 | 凭借 340 | 出于 341 | 出来 342 | 分别 343 | 则 344 | 则甚 345 | 别 346 | 别人 347 | 别处 348 | 别是 349 | 别的 350 | 别管 351 | 别说 352 | 到 353 | 前后 354 | 前此 355 | 前者 356 | 加之 357 | 加以 358 | 即 359 | 即令 360 | 即使 361 | 即便 362 | 即如 363 | 即或 364 | 即若 365 | 却 366 | 去 367 | 又 368 | 又及 369 | 及 370 | 及其 371 | 及至 372 | 反之 373 | 反而 374 | 反过来 375 | 反过来说 376 | 受到 377 | 另 378 | 另一方面 379 | 另外 380 | 另悉 381 | 只 382 | 只当 383 | 只怕 384 | 只是 385 | 只有 386 | 只消 387 | 只要 388 | 只限 389 | 叫 390 | 叮咚 391 | 可 392 | 可以 393 | 可是 394 | 可见 395 | 各 396 | 各个 397 | 各位 398 | 各种 399 | 各自 400 | 同 401 | 同时 402 | 后 403 | 后者 404 | 向 405 | 向使 406 | 向着 407 | 吓 408 | 吗 409 | 否则 410 | 吧 411 | 吧哒 412 | 吱 413 | 呀 414 | 呃 415 | 呕 416 | 呗 417 | 呜 418 | 呜呼 419 | 呢 420 | 呵 421 | 呵呵 422 | 呸 423 | 呼哧 424 | 咋 425 | 和 426 | 咚 427 | 咦 428 | 咧 429 | 咱 430 | 咱们 431 | 咳 432 | 哇 433 | 哈 434 | 哈哈 435 | 哉 436 | 哎 437 | 哎呀 438 | 哎哟 439 | 哗 440 | 哟 441 | 哦 442 | 哩 443 | 哪 444 | 哪个 445 | 哪些 446 | 哪儿 447 | 哪天 448 | 哪年 449 | 哪怕 450 | 哪样 451 | 哪边 452 | 哪里 453 | 哼 454 | 哼唷 455 | 唉 456 | 唯有 457 | 啊 458 | 啐 459 | 啥 460 | 啦 461 | 啪达 462 | 啷当 463 | 喂 464 | 喏 465 | 喔唷 466 | 喽 467 | 嗡 468 | 嗡嗡 469 | 嗬 470 | 嗯 471 | 嗳 472 | 嘎 473 | 嘎登 474 | 嘘 475 | 嘛 476 | 嘻 477 | 嘿 478 | 嘿嘿 479 | 因 480 | 因为 481 | 因了 482 | 因此 483 | 因着 484 | 因而 485 | 固然 486 | 在 487 | 在下 488 | 在于 489 | 地 490 | 基于 491 | 处在 492 | 多 493 | 多么 494 | 多少 495 | 大 496 | 大家 497 | 她 498 | 她们 499 | 好 500 | 如 501 | 如上 502 | 如上所述 503 | 如下 504 | 如何 505 | 如其 506 | 如同 507 | 如是 508 | 如果 509 | 如此 510 | 如若 511 | 始而 512 | 孰料 513 | 孰知 514 | 宁 515 | 宁可 516 | 宁愿 517 | 宁肯 518 | 它 519 | 它们 520 | 对 521 | 对于 522 | 对待 523 | 对方 524 | 对比 525 | 将 526 | 小 527 | 尔 528 | 尔后 529 | 尔尔 530 | 尚且 531 | 就 532 | 就是 533 | 就是了 534 | 就是说 535 | 就算 536 | 就要 537 | 尽 538 | 尽管 539 | 尽管如此 540 | 岂但 541 | 己 542 | 已 543 | 已矣 544 | 巴 545 | 巴巴 546 | 并 547 | 并且 548 | 并非 549 | 庶乎 550 | 庶几 551 | 开外 552 | 开始 553 | 归 554 | 归齐 555 | 当 556 | 当地 557 | 当然 558 | 当着 559 | 彼 560 | 彼时 561 | 彼此 562 | 往 563 | 待 564 | 很 565 | 得 566 | 得了 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎奈 572 | 怎样 573 | 总之 574 | 总的来看 575 | 总的来说 576 | 总的说来 577 | 总而言之 578 | 恰恰相反 579 | 您 580 | 惟其 581 | 慢说 582 | 我 583 | 我们 584 | 或 585 | 或则 586 | 或是 587 | 或曰 588 | 或者 589 | 截至 590 | 所 591 | 所以 592 | 所在 593 | 所幸 594 | 所有 595 | 才 596 | 才能 597 | 打 598 | 打从 599 | 把 600 | 抑或 601 | 拿 602 | 按 603 | 按照 604 | 换句话说 605 | 换言之 606 | 据 607 | 据此 608 | 接着 609 | 故 610 | 故此 611 | 故而 612 | 旁人 613 | 无 614 | 无宁 615 | 无论 616 | 既 617 | 既往 618 | 既是 619 | 既然 620 | 时候 621 | 是 622 | 是以 623 | 是的 624 | 曾 625 | 替 626 | 替代 627 | 最 628 | 有 629 | 有些 630 | 有关 631 | 有及 632 | 有时 633 | 有的 634 | 望 635 | 朝 636 | 朝着 637 | 本 638 | 本人 639 | 本地 640 | 本着 641 | 本身 642 | 来 643 | 来着 644 | 来自 645 | 来说 646 | 极了 647 | 果然 648 | 果真 649 | 某 650 | 某个 651 | 某些 652 | 某某 653 | 根据 654 | 欤 655 | 正值 656 | 正如 657 | 正巧 658 | 正是 659 | 此 660 | 此地 661 | 此处 662 | 此外 663 | 此时 664 | 此次 665 | 此间 666 | 毋宁 667 | 每 668 | 每当 669 | 比 670 | 比及 671 | 比如 672 | 比方 673 | 没奈何 674 | 沿 675 | 沿着 676 | 漫说 677 | 焉 678 | 然则 679 | 然后 680 | 然而 681 | 照 682 | 照着 683 | 犹且 684 | 犹自 685 | 甚且 686 | 甚么 687 | 甚或 688 | 甚而 689 | 甚至 690 | 甚至于 691 | 用 692 | 用来 693 | 由 694 | 由于 695 | 由是 696 | 由此 697 | 由此可见 698 | 的 699 | 的确 700 | 的话 701 | 直到 702 | 相对而言 703 | 省得 704 | 看 705 | 眨眼 706 | 着 707 | 着呢 708 | 矣 709 | 矣乎 710 | 矣哉 711 | 离 712 | 竟而 713 | 第 714 | 等 715 | 等到 716 | 等等 717 | 简言之 718 | 管 719 | 类如 720 | 紧接着 721 | 纵 722 | 纵令 723 | 纵使 724 | 纵然 725 | 经 726 | 经过 727 | 结果 728 | 给 729 | 继之 730 | 继后 731 | 继而 732 | 综上所述 733 | 罢了 734 | 者 735 | 而 736 | 而且 737 | 而况 738 | 而后 739 | 而外 740 | 而已 741 | 而是 742 | 而言 743 | 能 744 | 能否 745 | 腾 746 | 自 747 | 自个儿 748 | 自从 749 | 自各儿 750 | 自后 751 | 自家 752 | 自己 753 | 自打 754 | 自身 755 | 至 756 | 至于 757 | 至今 758 | 至若 759 | 致 760 | 般的 761 | 若 762 | 若夫 763 | 若是 764 | 若果 765 | 若非 766 | 莫不然 767 | 莫如 768 | 莫若 769 | 虽 770 | 虽则 771 | 虽然 772 | 虽说 773 | 被 774 | 要 775 | 要不 776 | 要不是 777 | 要不然 778 | 要么 779 | 要是 780 | 譬喻 781 | 譬如 782 | 让 783 | 许多 784 | 论 785 | 设使 786 | 设或 787 | 设若 788 | 诚如 789 | 诚然 790 | 该 791 | 说来 792 | 诸 793 | 诸位 794 | 诸如 795 | 谁 796 | 谁人 797 | 谁料 798 | 谁知 799 | 贼死 800 | 赖以 801 | 赶 802 | 起 803 | 起见 804 | 趁 805 | 趁着 806 | 越是 807 | 距 808 | 跟 809 | 较 810 | 较之 811 | 边 812 | 过 813 | 还 814 | 还是 815 | 还有 816 | 还要 817 | 这 818 | 这一来 819 | 这个 820 | 这么 821 | 这么些 822 | 这么样 823 | 这么点儿 824 | 这些 825 | 这会儿 826 | 这儿 827 | 这就是说 828 | 这时 829 | 这样 830 | 这次 831 | 这般 832 | 这边 833 | 这里 834 | 进而 835 | 连 836 | 连同 837 | 逐步 838 | 通过 839 | 遵循 840 | 遵照 841 | 那 842 | 那个 843 | 那么 844 | 那么些 845 | 那么样 846 | 那些 847 | 那会儿 848 | 那儿 849 | 那时 850 | 那样 851 | 那般 852 | 那边 853 | 那里 854 | 都 855 | 鄙人 856 | 鉴于 857 | 针对 858 | 阿 859 | 除 860 | 除了 861 | 除外 862 | 除开 863 | 除此之外 864 | 除非 865 | 随 866 | 随后 867 | 随时 868 | 随着 869 | 难道说 870 | 非但 871 | 非徒 872 | 非特 873 | 非独 874 | 靠 875 | 顺 876 | 顺着 877 | 首先 878 | ! 879 | , 880 | : 881 | ; 882 | ? 883 | to 884 | can 885 | could 886 | dare 887 | do 888 | did 889 | does 890 | may 891 | might 892 | would 893 | should 894 | must 895 | will 896 | ought 897 | shall 898 | need 899 | is 900 | a 901 | am 902 | are 903 | about 904 | according 905 | after 906 | against 907 | all 908 | almost 909 | also 910 | although 911 | among 912 | an 913 | and 914 | another 915 | any 916 | anything 917 | approximately 918 | as 919 | asked 920 | at 921 | back 922 | because 923 | before 924 | besides 925 | between 926 | both 927 | but 928 | by 929 | call 930 | called 931 | currently 932 | despite 933 | did 934 | do 935 | dr 936 | during 937 | each 938 | earlier 939 | eight 940 | even 941 | eventually 942 | every 943 | everything 944 | five 945 | for 946 | four 947 | from 948 | he 949 | her 950 | here 951 | his 952 | how 953 | however 954 | i 955 | if 956 | in 957 | indeed 958 | instead 959 | it 960 | its 961 | just 962 | last 963 | like 964 | major 965 | many 966 | may 967 | maybe 968 | meanwhile 969 | more 970 | moreover 971 | most 972 | mr 973 | mrs 974 | ms 975 | much 976 | my 977 | neither 978 | net 979 | never 980 | nevertheless 981 | nine 982 | no 983 | none 984 | not 985 | nothing 986 | now 987 | of 988 | on 989 | once 990 | one 991 | only 992 | or 993 | other 994 | our 995 | over 996 | partly 997 | perhaps 998 | prior 999 | regarding 1000 | separately 1001 | seven 1002 | several 1003 | she 1004 | should 1005 | similarly 1006 | since 1007 | six 1008 | so 1009 | some 1010 | somehow 1011 | still 1012 | such 1013 | ten 1014 | that 1015 | the 1016 | their 1017 | then 1018 | there 1019 | therefore 1020 | these 1021 | they 1022 | this 1023 | those 1024 | though 1025 | three 1026 | to 1027 | two 1028 | under 1029 | unless 1030 | unlike 1031 | until 1032 | volume 1033 | we 1034 | what 1035 | whatever 1036 | whats 1037 | when 1038 | where 1039 | which 1040 | while 1041 | why 1042 | with 1043 | without 1044 | yesterday 1045 | yet 1046 | you 1047 | your 1048 | aboard 1049 | about 1050 | above 1051 | according to 1052 | across 1053 | afore 1054 | after 1055 | against 1056 | agin 1057 | along 1058 | alongside 1059 | amid 1060 | amidst 1061 | among 1062 | amongst 1063 | anent 1064 | around 1065 | as 1066 | aslant 1067 | astride 1068 | at 1069 | athwart 1070 | bar 1071 | because of 1072 | before 1073 | behind 1074 | below 1075 | beneath 1076 | beside 1077 | besides 1078 | between 1079 | betwixt 1080 | beyond 1081 | but 1082 | by 1083 | circa 1084 | despite 1085 | down 1086 | during 1087 | due to 1088 | ere 1089 | except 1090 | for 1091 | from 1092 | in 1093 | inside 1094 | into 1095 | less 1096 | like 1097 | mid 1098 | midst 1099 | minus 1100 | near 1101 | next 1102 | nigh 1103 | nigher 1104 | nighest 1105 | notwithstanding 1106 | of 1107 | off 1108 | on 1109 | on to 1110 | onto 1111 | out 1112 | out of 1113 | outside 1114 | over 1115 | past 1116 | pending 1117 | per 1118 | plus 1119 | qua 1120 | re 1121 | round 1122 | sans 1123 | save 1124 | since 1125 | through 1126 | throughout 1127 | thru 1128 | till 1129 | to 1130 | toward 1131 | towards 1132 | under 1133 | underneath 1134 | unlike 1135 | until 1136 | unto 1137 | up 1138 | upon 1139 | versus 1140 | via 1141 | vice 1142 | with 1143 | within 1144 | without 1145 | he 1146 | her 1147 | herself 1148 | hers 1149 | him 1150 | himself 1151 | his 1152 | I 1153 | it 1154 | its 1155 | itself 1156 | me 1157 | mine 1158 | my 1159 | myself 1160 | ours 1161 | she 1162 | their 1163 | theirs 1164 | them 1165 | themselves 1166 | they 1167 | us 1168 | we 1169 | our 1170 | ourselves 1171 | you 1172 | your 1173 | yours 1174 | yourselves 1175 | yourself 1176 | this 1177 | that 1178 | these 1179 | those 1180 | " 1181 | ' 1182 | '' 1183 | ( 1184 | ) 1185 | *LRB* 1186 | *RRB* 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | @ 1193 | & 1194 | [ 1195 | ] 1196 | ` 1197 | `` 1198 | e.g., 1199 | { 1200 | } 1201 | " 1202 | “ 1203 | ” 1204 | -RRB- 1205 | -LRB- 1206 | -- 1207 | a 1208 | about 1209 | above 1210 | across 1211 | after 1212 | afterwards 1213 | again 1214 | against 1215 | all 1216 | almost 1217 | alone 1218 | along 1219 | already 1220 | also 1221 | although 1222 | always 1223 | am 1224 | among 1225 | amongst 1226 | amoungst 1227 | amount 1228 | an 1229 | and 1230 | another 1231 | any 1232 | anyhow 1233 | anyone 1234 | anything 1235 | anyway 1236 | anywhere 1237 | are 1238 | around 1239 | as 1240 | at 1241 | back 1242 | be 1243 | became 1244 | because 1245 | become 1246 | becomes 1247 | becoming 1248 | been 1249 | before 1250 | beforehand 1251 | behind 1252 | being 1253 | below 1254 | beside 1255 | besides 1256 | between 1257 | beyond 1258 | bill 1259 | both 1260 | bottom 1261 | but 1262 | by 1263 | call 1264 | can 1265 | cannot 1266 | cant 1267 | co 1268 | computer 1269 | con 1270 | could 1271 | couldnt 1272 | cry 1273 | de 1274 | describe 1275 | detail 1276 | do 1277 | done 1278 | down 1279 | due 1280 | during 1281 | each 1282 | eg 1283 | eight 1284 | either 1285 | eleven 1286 | else 1287 | elsewhere 1288 | empty 1289 | enough 1290 | etc 1291 | even 1292 | ever 1293 | every 1294 | everyone 1295 | everything 1296 | everywhere 1297 | except 1298 | few 1299 | fifteen 1300 | fify 1301 | fill 1302 | find 1303 | fire 1304 | first 1305 | five 1306 | for 1307 | former 1308 | formerly 1309 | forty 1310 | found 1311 | four 1312 | from 1313 | front 1314 | full 1315 | further 1316 | get 1317 | give 1318 | go 1319 | had 1320 | has 1321 | hasnt 1322 | have 1323 | he 1324 | hence 1325 | her 1326 | here 1327 | hereafter 1328 | hereby 1329 | herein 1330 | hereupon 1331 | hers 1332 | herself 1333 | him 1334 | himself 1335 | his 1336 | how 1337 | however 1338 | hundred 1339 | i 1340 | ie 1341 | if 1342 | in 1343 | inc 1344 | indeed 1345 | interest 1346 | into 1347 | is 1348 | it 1349 | its 1350 | itself 1351 | keep 1352 | last 1353 | latter 1354 | latterly 1355 | least 1356 | less 1357 | ltd 1358 | made 1359 | many 1360 | may 1361 | me 1362 | meanwhile 1363 | might 1364 | mill 1365 | mine 1366 | more 1367 | moreover 1368 | most 1369 | mostly 1370 | move 1371 | much 1372 | must 1373 | my 1374 | myself 1375 | name 1376 | namely 1377 | neither 1378 | never 1379 | nevertheless 1380 | next 1381 | nine 1382 | no 1383 | nobody 1384 | none 1385 | noone 1386 | nor 1387 | not 1388 | nothing 1389 | now 1390 | nowhere 1391 | of 1392 | off 1393 | often 1394 | on 1395 | once 1396 | one 1397 | only 1398 | onto 1399 | or 1400 | other 1401 | others 1402 | otherwise 1403 | our 1404 | ours 1405 | ourselves 1406 | out 1407 | over 1408 | own 1409 | part 1410 | per 1411 | perhaps 1412 | please 1413 | put 1414 | rather 1415 | re 1416 | same 1417 | see 1418 | seem 1419 | seemed 1420 | seeming 1421 | seems 1422 | serious 1423 | several 1424 | she 1425 | should 1426 | show 1427 | side 1428 | since 1429 | sincere 1430 | six 1431 | sixty 1432 | so 1433 | some 1434 | somehow 1435 | someone 1436 | something 1437 | sometime 1438 | sometimes 1439 | somewhere 1440 | still 1441 | such 1442 | system 1443 | take 1444 | ten 1445 | than 1446 | that 1447 | the 1448 | their 1449 | them 1450 | themselves 1451 | then 1452 | thence 1453 | there 1454 | thereafter 1455 | thereby 1456 | therefore 1457 | therein 1458 | thereupon 1459 | these 1460 | they 1461 | thick 1462 | thin 1463 | third 1464 | this 1465 | those 1466 | though 1467 | three 1468 | through 1469 | throughout 1470 | thru 1471 | thus 1472 | to 1473 | together 1474 | too 1475 | top 1476 | toward 1477 | towards 1478 | twelve 1479 | twenty 1480 | two 1481 | un 1482 | under 1483 | until 1484 | up 1485 | upon 1486 | us 1487 | very 1488 | via 1489 | was 1490 | we 1491 | well 1492 | were 1493 | what 1494 | whatever 1495 | when 1496 | whence 1497 | whenever 1498 | where 1499 | whereafter 1500 | whereas 1501 | whereby 1502 | wherein 1503 | whereupon 1504 | wherever 1505 | whether 1506 | which 1507 | while 1508 | whither 1509 | who 1510 | whoever 1511 | whole 1512 | whom 1513 | whose 1514 | why 1515 | will 1516 | with 1517 | within 1518 | without 1519 | would 1520 | yet 1521 | you 1522 | your 1523 | yours 1524 | yourself 1525 | yourselves 1526 | 1527 | 1528 | : 1529 | / 1530 | ( 1531 | > 1532 | ) 1533 | < 1534 | ! 1535 | www 1536 | sina 1537 | com 1538 | alt 1539 | gt 1540 | lt 1541 | 首页 1542 | # 1543 | ## 1544 | http 1545 | cn 1546 | ...... 1547 | ..... 1548 | .... 1549 | ... 1550 | .. 1551 | doge 1552 | · 1553 | t 1554 | 请 1555 | 哒 1556 | 咩 1557 | ~ 1558 | ✌ 1559 | ® 1560 | -------------------------------------------------------------------------------- /gen-py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanjunxiao/tag_doc_with_lda/5ba0051a6d74d42b540f23f97722010929364c9c/gen-py/__init__.py -------------------------------------------------------------------------------- /gen-py/rpc/DocServlet-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler (0.9.2) 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import TSSLSocket 16 | from thrift.transport import THttpClient 17 | from thrift.protocol import TBinaryProtocol 18 | 19 | from rpc import DocServlet 20 | from rpc.ttypes import * 21 | 22 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 23 | print('') 24 | print('Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] [-s[sl]] function [arg1 [arg2...]]') 25 | print('') 26 | print('Functions:') 27 | print(' string Ping(string sth)') 28 | print(' Topics Infer(Docs docs, i32 burn_in_iterations, i32 accumulating_iterations, i32 docnumoftopic)') 29 | print(' Topics GetTopics(i32 docnumoftopic)') 30 | print(' void Sender2(Docs docs)') 31 | print('') 32 | sys.exit(0) 33 | 34 | pp = pprint.PrettyPrinter(indent = 2) 35 | host = 'localhost' 36 | port = 9090 37 | uri = '' 38 | framed = False 39 | ssl = False 40 | http = False 41 | argi = 1 42 | 43 | if sys.argv[argi] == '-h': 44 | parts = sys.argv[argi+1].split(':') 45 | host = parts[0] 46 | if len(parts) > 1: 47 | port = int(parts[1]) 48 | argi += 2 49 | 50 | if sys.argv[argi] == '-u': 51 | url = urlparse(sys.argv[argi+1]) 52 | parts = url[1].split(':') 53 | host = parts[0] 54 | if len(parts) > 1: 55 | port = int(parts[1]) 56 | else: 57 | port = 80 58 | uri = url[2] 59 | if url[4]: 60 | uri += '?%s' % url[4] 61 | http = True 62 | argi += 2 63 | 64 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 65 | framed = True 66 | argi += 1 67 | 68 | if sys.argv[argi] == '-s' or sys.argv[argi] == '-ssl': 69 | ssl = True 70 | argi += 1 71 | 72 | cmd = sys.argv[argi] 73 | args = sys.argv[argi+1:] 74 | 75 | if http: 76 | transport = THttpClient.THttpClient(host, port, uri) 77 | else: 78 | socket = TSSLSocket.TSSLSocket(host, port, validate=False) if ssl else TSocket.TSocket(host, port) 79 | if framed: 80 | transport = TTransport.TFramedTransport(socket) 81 | else: 82 | transport = TTransport.TBufferedTransport(socket) 83 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 84 | client = DocServlet.Client(protocol) 85 | transport.open() 86 | 87 | if cmd == 'Ping': 88 | if len(args) != 1: 89 | print('Ping requires 1 args') 90 | sys.exit(1) 91 | pp.pprint(client.Ping(args[0],)) 92 | 93 | elif cmd == 'Infer': 94 | if len(args) != 4: 95 | print('Infer requires 4 args') 96 | sys.exit(1) 97 | pp.pprint(client.Infer(eval(args[0]),eval(args[1]),eval(args[2]),eval(args[3]),)) 98 | 99 | elif cmd == 'GetTopics': 100 | if len(args) != 1: 101 | print('GetTopics requires 1 args') 102 | sys.exit(1) 103 | pp.pprint(client.GetTopics(eval(args[0]),)) 104 | 105 | elif cmd == 'Sender2': 106 | if len(args) != 1: 107 | print('Sender2 requires 1 args') 108 | sys.exit(1) 109 | pp.pprint(client.Sender2(eval(args[0]),)) 110 | 111 | else: 112 | print('Unrecognized method %s' % cmd) 113 | sys.exit(1) 114 | 115 | transport.close() 116 | -------------------------------------------------------------------------------- /gen-py/rpc/DocServlet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.9.2) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | from thrift.Thrift import TProcessor 12 | from thrift.transport import TTransport 13 | from thrift.protocol import TBinaryProtocol, TProtocol 14 | try: 15 | from thrift.protocol import fastbinary 16 | except: 17 | fastbinary = None 18 | 19 | 20 | class Iface: 21 | def Ping(self, sth): 22 | """ 23 | Parameters: 24 | - sth 25 | """ 26 | pass 27 | 28 | def Infer(self, docs, burn_in_iterations, accumulating_iterations, docnumoftopic): 29 | """ 30 | Parameters: 31 | - docs 32 | - burn_in_iterations 33 | - accumulating_iterations 34 | - docnumoftopic 35 | """ 36 | pass 37 | 38 | def GetTopics(self, docnumoftopic): 39 | """ 40 | Parameters: 41 | - docnumoftopic 42 | """ 43 | pass 44 | 45 | def Sender2(self, docs): 46 | """ 47 | Parameters: 48 | - docs 49 | """ 50 | pass 51 | 52 | 53 | class Client(Iface): 54 | def __init__(self, iprot, oprot=None): 55 | self._iprot = self._oprot = iprot 56 | if oprot is not None: 57 | self._oprot = oprot 58 | self._seqid = 0 59 | 60 | def Ping(self, sth): 61 | """ 62 | Parameters: 63 | - sth 64 | """ 65 | self.send_Ping(sth) 66 | return self.recv_Ping() 67 | 68 | def send_Ping(self, sth): 69 | self._oprot.writeMessageBegin('Ping', TMessageType.CALL, self._seqid) 70 | args = Ping_args() 71 | args.sth = sth 72 | args.write(self._oprot) 73 | self._oprot.writeMessageEnd() 74 | self._oprot.trans.flush() 75 | 76 | def recv_Ping(self): 77 | iprot = self._iprot 78 | (fname, mtype, rseqid) = iprot.readMessageBegin() 79 | if mtype == TMessageType.EXCEPTION: 80 | x = TApplicationException() 81 | x.read(iprot) 82 | iprot.readMessageEnd() 83 | raise x 84 | result = Ping_result() 85 | result.read(iprot) 86 | iprot.readMessageEnd() 87 | if result.success is not None: 88 | return result.success 89 | raise TApplicationException(TApplicationException.MISSING_RESULT, "Ping failed: unknown result"); 90 | 91 | def Infer(self, docs, burn_in_iterations, accumulating_iterations, docnumoftopic): 92 | """ 93 | Parameters: 94 | - docs 95 | - burn_in_iterations 96 | - accumulating_iterations 97 | - docnumoftopic 98 | """ 99 | self.send_Infer(docs, burn_in_iterations, accumulating_iterations, docnumoftopic) 100 | return self.recv_Infer() 101 | 102 | def send_Infer(self, docs, burn_in_iterations, accumulating_iterations, docnumoftopic): 103 | self._oprot.writeMessageBegin('Infer', TMessageType.CALL, self._seqid) 104 | args = Infer_args() 105 | args.docs = docs 106 | args.burn_in_iterations = burn_in_iterations 107 | args.accumulating_iterations = accumulating_iterations 108 | args.docnumoftopic = docnumoftopic 109 | args.write(self._oprot) 110 | self._oprot.writeMessageEnd() 111 | self._oprot.trans.flush() 112 | 113 | def recv_Infer(self): 114 | iprot = self._iprot 115 | (fname, mtype, rseqid) = iprot.readMessageBegin() 116 | if mtype == TMessageType.EXCEPTION: 117 | x = TApplicationException() 118 | x.read(iprot) 119 | iprot.readMessageEnd() 120 | raise x 121 | result = Infer_result() 122 | result.read(iprot) 123 | iprot.readMessageEnd() 124 | if result.success is not None: 125 | return result.success 126 | raise TApplicationException(TApplicationException.MISSING_RESULT, "Infer failed: unknown result"); 127 | 128 | def GetTopics(self, docnumoftopic): 129 | """ 130 | Parameters: 131 | - docnumoftopic 132 | """ 133 | self.send_GetTopics(docnumoftopic) 134 | return self.recv_GetTopics() 135 | 136 | def send_GetTopics(self, docnumoftopic): 137 | self._oprot.writeMessageBegin('GetTopics', TMessageType.CALL, self._seqid) 138 | args = GetTopics_args() 139 | args.docnumoftopic = docnumoftopic 140 | args.write(self._oprot) 141 | self._oprot.writeMessageEnd() 142 | self._oprot.trans.flush() 143 | 144 | def recv_GetTopics(self): 145 | iprot = self._iprot 146 | (fname, mtype, rseqid) = iprot.readMessageBegin() 147 | if mtype == TMessageType.EXCEPTION: 148 | x = TApplicationException() 149 | x.read(iprot) 150 | iprot.readMessageEnd() 151 | raise x 152 | result = GetTopics_result() 153 | result.read(iprot) 154 | iprot.readMessageEnd() 155 | if result.success is not None: 156 | return result.success 157 | raise TApplicationException(TApplicationException.MISSING_RESULT, "GetTopics failed: unknown result"); 158 | 159 | def Sender2(self, docs): 160 | """ 161 | Parameters: 162 | - docs 163 | """ 164 | self.send_Sender2(docs) 165 | 166 | def send_Sender2(self, docs): 167 | self._oprot.writeMessageBegin('Sender2', TMessageType.ONEWAY, self._seqid) 168 | args = Sender2_args() 169 | args.docs = docs 170 | args.write(self._oprot) 171 | self._oprot.writeMessageEnd() 172 | self._oprot.trans.flush() 173 | 174 | class Processor(Iface, TProcessor): 175 | def __init__(self, handler): 176 | self._handler = handler 177 | self._processMap = {} 178 | self._processMap["Ping"] = Processor.process_Ping 179 | self._processMap["Infer"] = Processor.process_Infer 180 | self._processMap["GetTopics"] = Processor.process_GetTopics 181 | self._processMap["Sender2"] = Processor.process_Sender2 182 | 183 | def process(self, iprot, oprot): 184 | (name, type, seqid) = iprot.readMessageBegin() 185 | if name not in self._processMap: 186 | iprot.skip(TType.STRUCT) 187 | iprot.readMessageEnd() 188 | x = TApplicationException(TApplicationException.UNKNOWN_METHOD, 'Unknown function %s' % (name)) 189 | oprot.writeMessageBegin(name, TMessageType.EXCEPTION, seqid) 190 | x.write(oprot) 191 | oprot.writeMessageEnd() 192 | oprot.trans.flush() 193 | return 194 | else: 195 | self._processMap[name](self, seqid, iprot, oprot) 196 | return True 197 | 198 | def process_Ping(self, seqid, iprot, oprot): 199 | args = Ping_args() 200 | args.read(iprot) 201 | iprot.readMessageEnd() 202 | result = Ping_result() 203 | result.success = self._handler.Ping(args.sth) 204 | oprot.writeMessageBegin("Ping", TMessageType.REPLY, seqid) 205 | result.write(oprot) 206 | oprot.writeMessageEnd() 207 | oprot.trans.flush() 208 | 209 | def process_Infer(self, seqid, iprot, oprot): 210 | args = Infer_args() 211 | args.read(iprot) 212 | iprot.readMessageEnd() 213 | result = Infer_result() 214 | result.success = self._handler.Infer(args.docs, args.burn_in_iterations, args.accumulating_iterations, args.docnumoftopic) 215 | oprot.writeMessageBegin("Infer", TMessageType.REPLY, seqid) 216 | result.write(oprot) 217 | oprot.writeMessageEnd() 218 | oprot.trans.flush() 219 | 220 | def process_GetTopics(self, seqid, iprot, oprot): 221 | args = GetTopics_args() 222 | args.read(iprot) 223 | iprot.readMessageEnd() 224 | result = GetTopics_result() 225 | result.success = self._handler.GetTopics(args.docnumoftopic) 226 | oprot.writeMessageBegin("GetTopics", TMessageType.REPLY, seqid) 227 | result.write(oprot) 228 | oprot.writeMessageEnd() 229 | oprot.trans.flush() 230 | 231 | def process_Sender2(self, seqid, iprot, oprot): 232 | args = Sender2_args() 233 | args.read(iprot) 234 | iprot.readMessageEnd() 235 | self._handler.Sender2(args.docs) 236 | return 237 | 238 | 239 | # HELPER FUNCTIONS AND STRUCTURES 240 | 241 | class Ping_args: 242 | """ 243 | Attributes: 244 | - sth 245 | """ 246 | 247 | thrift_spec = ( 248 | None, # 0 249 | (1, TType.STRING, 'sth', None, None, ), # 1 250 | ) 251 | 252 | def __init__(self, sth=None,): 253 | self.sth = sth 254 | 255 | def read(self, iprot): 256 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 257 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 258 | return 259 | iprot.readStructBegin() 260 | while True: 261 | (fname, ftype, fid) = iprot.readFieldBegin() 262 | if ftype == TType.STOP: 263 | break 264 | if fid == 1: 265 | if ftype == TType.STRING: 266 | self.sth = iprot.readString(); 267 | else: 268 | iprot.skip(ftype) 269 | else: 270 | iprot.skip(ftype) 271 | iprot.readFieldEnd() 272 | iprot.readStructEnd() 273 | 274 | def write(self, oprot): 275 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 276 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 277 | return 278 | oprot.writeStructBegin('Ping_args') 279 | if self.sth is not None: 280 | oprot.writeFieldBegin('sth', TType.STRING, 1) 281 | oprot.writeString(self.sth) 282 | oprot.writeFieldEnd() 283 | oprot.writeFieldStop() 284 | oprot.writeStructEnd() 285 | 286 | def validate(self): 287 | return 288 | 289 | 290 | def __hash__(self): 291 | value = 17 292 | value = (value * 31) ^ hash(self.sth) 293 | return value 294 | 295 | def __repr__(self): 296 | L = ['%s=%r' % (key, value) 297 | for key, value in self.__dict__.iteritems()] 298 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 299 | 300 | def __eq__(self, other): 301 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 302 | 303 | def __ne__(self, other): 304 | return not (self == other) 305 | 306 | class Ping_result: 307 | """ 308 | Attributes: 309 | - success 310 | """ 311 | 312 | thrift_spec = ( 313 | (0, TType.STRING, 'success', None, None, ), # 0 314 | ) 315 | 316 | def __init__(self, success=None,): 317 | self.success = success 318 | 319 | def read(self, iprot): 320 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 321 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 322 | return 323 | iprot.readStructBegin() 324 | while True: 325 | (fname, ftype, fid) = iprot.readFieldBegin() 326 | if ftype == TType.STOP: 327 | break 328 | if fid == 0: 329 | if ftype == TType.STRING: 330 | self.success = iprot.readString(); 331 | else: 332 | iprot.skip(ftype) 333 | else: 334 | iprot.skip(ftype) 335 | iprot.readFieldEnd() 336 | iprot.readStructEnd() 337 | 338 | def write(self, oprot): 339 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 340 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 341 | return 342 | oprot.writeStructBegin('Ping_result') 343 | if self.success is not None: 344 | oprot.writeFieldBegin('success', TType.STRING, 0) 345 | oprot.writeString(self.success) 346 | oprot.writeFieldEnd() 347 | oprot.writeFieldStop() 348 | oprot.writeStructEnd() 349 | 350 | def validate(self): 351 | return 352 | 353 | 354 | def __hash__(self): 355 | value = 17 356 | value = (value * 31) ^ hash(self.success) 357 | return value 358 | 359 | def __repr__(self): 360 | L = ['%s=%r' % (key, value) 361 | for key, value in self.__dict__.iteritems()] 362 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 363 | 364 | def __eq__(self, other): 365 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 366 | 367 | def __ne__(self, other): 368 | return not (self == other) 369 | 370 | class Infer_args: 371 | """ 372 | Attributes: 373 | - docs 374 | - burn_in_iterations 375 | - accumulating_iterations 376 | - docnumoftopic 377 | """ 378 | 379 | thrift_spec = ( 380 | None, # 0 381 | (1, TType.LIST, 'docs', (TType.STRUCT,(Doc_Info, Doc_Info.thrift_spec)), None, ), # 1 382 | (2, TType.I32, 'burn_in_iterations', None, 15, ), # 2 383 | (3, TType.I32, 'accumulating_iterations', None, 10, ), # 3 384 | (4, TType.I32, 'docnumoftopic', None, 15, ), # 4 385 | ) 386 | 387 | def __init__(self, docs=None, burn_in_iterations=thrift_spec[2][4], accumulating_iterations=thrift_spec[3][4], docnumoftopic=thrift_spec[4][4],): 388 | self.docs = docs 389 | self.burn_in_iterations = burn_in_iterations 390 | self.accumulating_iterations = accumulating_iterations 391 | self.docnumoftopic = docnumoftopic 392 | 393 | def read(self, iprot): 394 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 395 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 396 | return 397 | iprot.readStructBegin() 398 | while True: 399 | (fname, ftype, fid) = iprot.readFieldBegin() 400 | if ftype == TType.STOP: 401 | break 402 | if fid == 1: 403 | if ftype == TType.LIST: 404 | self.docs = [] 405 | (_etype10, _size7) = iprot.readListBegin() 406 | for _i11 in xrange(_size7): 407 | _elem12 = Doc_Info() 408 | _elem12.read(iprot) 409 | self.docs.append(_elem12) 410 | iprot.readListEnd() 411 | else: 412 | iprot.skip(ftype) 413 | elif fid == 2: 414 | if ftype == TType.I32: 415 | self.burn_in_iterations = iprot.readI32(); 416 | else: 417 | iprot.skip(ftype) 418 | elif fid == 3: 419 | if ftype == TType.I32: 420 | self.accumulating_iterations = iprot.readI32(); 421 | else: 422 | iprot.skip(ftype) 423 | elif fid == 4: 424 | if ftype == TType.I32: 425 | self.docnumoftopic = iprot.readI32(); 426 | else: 427 | iprot.skip(ftype) 428 | else: 429 | iprot.skip(ftype) 430 | iprot.readFieldEnd() 431 | iprot.readStructEnd() 432 | 433 | def write(self, oprot): 434 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 435 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 436 | return 437 | oprot.writeStructBegin('Infer_args') 438 | if self.docs is not None: 439 | oprot.writeFieldBegin('docs', TType.LIST, 1) 440 | oprot.writeListBegin(TType.STRUCT, len(self.docs)) 441 | for iter13 in self.docs: 442 | iter13.write(oprot) 443 | oprot.writeListEnd() 444 | oprot.writeFieldEnd() 445 | if self.burn_in_iterations is not None: 446 | oprot.writeFieldBegin('burn_in_iterations', TType.I32, 2) 447 | oprot.writeI32(self.burn_in_iterations) 448 | oprot.writeFieldEnd() 449 | if self.accumulating_iterations is not None: 450 | oprot.writeFieldBegin('accumulating_iterations', TType.I32, 3) 451 | oprot.writeI32(self.accumulating_iterations) 452 | oprot.writeFieldEnd() 453 | if self.docnumoftopic is not None: 454 | oprot.writeFieldBegin('docnumoftopic', TType.I32, 4) 455 | oprot.writeI32(self.docnumoftopic) 456 | oprot.writeFieldEnd() 457 | oprot.writeFieldStop() 458 | oprot.writeStructEnd() 459 | 460 | def validate(self): 461 | return 462 | 463 | 464 | def __hash__(self): 465 | value = 17 466 | value = (value * 31) ^ hash(self.docs) 467 | value = (value * 31) ^ hash(self.burn_in_iterations) 468 | value = (value * 31) ^ hash(self.accumulating_iterations) 469 | value = (value * 31) ^ hash(self.docnumoftopic) 470 | return value 471 | 472 | def __repr__(self): 473 | L = ['%s=%r' % (key, value) 474 | for key, value in self.__dict__.iteritems()] 475 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 476 | 477 | def __eq__(self, other): 478 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 479 | 480 | def __ne__(self, other): 481 | return not (self == other) 482 | 483 | class Infer_result: 484 | """ 485 | Attributes: 486 | - success 487 | """ 488 | 489 | thrift_spec = ( 490 | (0, TType.LIST, 'success', (TType.STRUCT,(Topic_Info, Topic_Info.thrift_spec)), None, ), # 0 491 | ) 492 | 493 | def __init__(self, success=None,): 494 | self.success = success 495 | 496 | def read(self, iprot): 497 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 498 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 499 | return 500 | iprot.readStructBegin() 501 | while True: 502 | (fname, ftype, fid) = iprot.readFieldBegin() 503 | if ftype == TType.STOP: 504 | break 505 | if fid == 0: 506 | if ftype == TType.LIST: 507 | self.success = [] 508 | (_etype17, _size14) = iprot.readListBegin() 509 | for _i18 in xrange(_size14): 510 | _elem19 = Topic_Info() 511 | _elem19.read(iprot) 512 | self.success.append(_elem19) 513 | iprot.readListEnd() 514 | else: 515 | iprot.skip(ftype) 516 | else: 517 | iprot.skip(ftype) 518 | iprot.readFieldEnd() 519 | iprot.readStructEnd() 520 | 521 | def write(self, oprot): 522 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 523 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 524 | return 525 | oprot.writeStructBegin('Infer_result') 526 | if self.success is not None: 527 | oprot.writeFieldBegin('success', TType.LIST, 0) 528 | oprot.writeListBegin(TType.STRUCT, len(self.success)) 529 | for iter20 in self.success: 530 | iter20.write(oprot) 531 | oprot.writeListEnd() 532 | oprot.writeFieldEnd() 533 | oprot.writeFieldStop() 534 | oprot.writeStructEnd() 535 | 536 | def validate(self): 537 | return 538 | 539 | 540 | def __hash__(self): 541 | value = 17 542 | value = (value * 31) ^ hash(self.success) 543 | return value 544 | 545 | def __repr__(self): 546 | L = ['%s=%r' % (key, value) 547 | for key, value in self.__dict__.iteritems()] 548 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 549 | 550 | def __eq__(self, other): 551 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 552 | 553 | def __ne__(self, other): 554 | return not (self == other) 555 | 556 | class GetTopics_args: 557 | """ 558 | Attributes: 559 | - docnumoftopic 560 | """ 561 | 562 | thrift_spec = ( 563 | None, # 0 564 | (1, TType.I32, 'docnumoftopic', None, 0, ), # 1 565 | ) 566 | 567 | def __init__(self, docnumoftopic=thrift_spec[1][4],): 568 | self.docnumoftopic = docnumoftopic 569 | 570 | def read(self, iprot): 571 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 572 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 573 | return 574 | iprot.readStructBegin() 575 | while True: 576 | (fname, ftype, fid) = iprot.readFieldBegin() 577 | if ftype == TType.STOP: 578 | break 579 | if fid == 1: 580 | if ftype == TType.I32: 581 | self.docnumoftopic = iprot.readI32(); 582 | else: 583 | iprot.skip(ftype) 584 | else: 585 | iprot.skip(ftype) 586 | iprot.readFieldEnd() 587 | iprot.readStructEnd() 588 | 589 | def write(self, oprot): 590 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 591 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 592 | return 593 | oprot.writeStructBegin('GetTopics_args') 594 | if self.docnumoftopic is not None: 595 | oprot.writeFieldBegin('docnumoftopic', TType.I32, 1) 596 | oprot.writeI32(self.docnumoftopic) 597 | oprot.writeFieldEnd() 598 | oprot.writeFieldStop() 599 | oprot.writeStructEnd() 600 | 601 | def validate(self): 602 | return 603 | 604 | 605 | def __hash__(self): 606 | value = 17 607 | value = (value * 31) ^ hash(self.docnumoftopic) 608 | return value 609 | 610 | def __repr__(self): 611 | L = ['%s=%r' % (key, value) 612 | for key, value in self.__dict__.iteritems()] 613 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 614 | 615 | def __eq__(self, other): 616 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 617 | 618 | def __ne__(self, other): 619 | return not (self == other) 620 | 621 | class GetTopics_result: 622 | """ 623 | Attributes: 624 | - success 625 | """ 626 | 627 | thrift_spec = ( 628 | (0, TType.LIST, 'success', (TType.STRUCT,(Topic_Info, Topic_Info.thrift_spec)), None, ), # 0 629 | ) 630 | 631 | def __init__(self, success=None,): 632 | self.success = success 633 | 634 | def read(self, iprot): 635 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 636 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 637 | return 638 | iprot.readStructBegin() 639 | while True: 640 | (fname, ftype, fid) = iprot.readFieldBegin() 641 | if ftype == TType.STOP: 642 | break 643 | if fid == 0: 644 | if ftype == TType.LIST: 645 | self.success = [] 646 | (_etype24, _size21) = iprot.readListBegin() 647 | for _i25 in xrange(_size21): 648 | _elem26 = Topic_Info() 649 | _elem26.read(iprot) 650 | self.success.append(_elem26) 651 | iprot.readListEnd() 652 | else: 653 | iprot.skip(ftype) 654 | else: 655 | iprot.skip(ftype) 656 | iprot.readFieldEnd() 657 | iprot.readStructEnd() 658 | 659 | def write(self, oprot): 660 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 661 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 662 | return 663 | oprot.writeStructBegin('GetTopics_result') 664 | if self.success is not None: 665 | oprot.writeFieldBegin('success', TType.LIST, 0) 666 | oprot.writeListBegin(TType.STRUCT, len(self.success)) 667 | for iter27 in self.success: 668 | iter27.write(oprot) 669 | oprot.writeListEnd() 670 | oprot.writeFieldEnd() 671 | oprot.writeFieldStop() 672 | oprot.writeStructEnd() 673 | 674 | def validate(self): 675 | return 676 | 677 | 678 | def __hash__(self): 679 | value = 17 680 | value = (value * 31) ^ hash(self.success) 681 | return value 682 | 683 | def __repr__(self): 684 | L = ['%s=%r' % (key, value) 685 | for key, value in self.__dict__.iteritems()] 686 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 687 | 688 | def __eq__(self, other): 689 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 690 | 691 | def __ne__(self, other): 692 | return not (self == other) 693 | 694 | class Sender2_args: 695 | """ 696 | Attributes: 697 | - docs 698 | """ 699 | 700 | thrift_spec = ( 701 | None, # 0 702 | (1, TType.LIST, 'docs', (TType.STRUCT,(Doc_Info, Doc_Info.thrift_spec)), None, ), # 1 703 | ) 704 | 705 | def __init__(self, docs=None,): 706 | self.docs = docs 707 | 708 | def read(self, iprot): 709 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 710 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 711 | return 712 | iprot.readStructBegin() 713 | while True: 714 | (fname, ftype, fid) = iprot.readFieldBegin() 715 | if ftype == TType.STOP: 716 | break 717 | if fid == 1: 718 | if ftype == TType.LIST: 719 | self.docs = [] 720 | (_etype31, _size28) = iprot.readListBegin() 721 | for _i32 in xrange(_size28): 722 | _elem33 = Doc_Info() 723 | _elem33.read(iprot) 724 | self.docs.append(_elem33) 725 | iprot.readListEnd() 726 | else: 727 | iprot.skip(ftype) 728 | else: 729 | iprot.skip(ftype) 730 | iprot.readFieldEnd() 731 | iprot.readStructEnd() 732 | 733 | def write(self, oprot): 734 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 735 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 736 | return 737 | oprot.writeStructBegin('Sender2_args') 738 | if self.docs is not None: 739 | oprot.writeFieldBegin('docs', TType.LIST, 1) 740 | oprot.writeListBegin(TType.STRUCT, len(self.docs)) 741 | for iter34 in self.docs: 742 | iter34.write(oprot) 743 | oprot.writeListEnd() 744 | oprot.writeFieldEnd() 745 | oprot.writeFieldStop() 746 | oprot.writeStructEnd() 747 | 748 | def validate(self): 749 | return 750 | 751 | 752 | def __hash__(self): 753 | value = 17 754 | value = (value * 31) ^ hash(self.docs) 755 | return value 756 | 757 | def __repr__(self): 758 | L = ['%s=%r' % (key, value) 759 | for key, value in self.__dict__.iteritems()] 760 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 761 | 762 | def __eq__(self, other): 763 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 764 | 765 | def __ne__(self, other): 766 | return not (self == other) 767 | -------------------------------------------------------------------------------- /gen-py/rpc/DocServlet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanjunxiao/tag_doc_with_lda/5ba0051a6d74d42b540f23f97722010929364c9c/gen-py/rpc/DocServlet.pyc -------------------------------------------------------------------------------- /gen-py/rpc/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'DocServlet'] 2 | -------------------------------------------------------------------------------- /gen-py/rpc/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanjunxiao/tag_doc_with_lda/5ba0051a6d74d42b540f23f97722010929364c9c/gen-py/rpc/__init__.pyc -------------------------------------------------------------------------------- /gen-py/rpc/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.9.2) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /gen-py/rpc/ttypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.9.2) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | 11 | from thrift.transport import TTransport 12 | from thrift.protocol import TBinaryProtocol, TProtocol 13 | try: 14 | from thrift.protocol import fastbinary 15 | except: 16 | fastbinary = None 17 | 18 | 19 | 20 | class Doc_Info: 21 | """ 22 | Attributes: 23 | - docid 24 | - text 25 | - consinesim 26 | """ 27 | 28 | thrift_spec = ( 29 | None, # 0 30 | (1, TType.STRING, 'docid', None, "", ), # 1 31 | (2, TType.STRING, 'text', None, None, ), # 2 32 | (3, TType.DOUBLE, 'consinesim', None, 0, ), # 3 33 | ) 34 | 35 | def __init__(self, docid=thrift_spec[1][4], text=None, consinesim=thrift_spec[3][4],): 36 | self.docid = docid 37 | self.text = text 38 | self.consinesim = consinesim 39 | 40 | def read(self, iprot): 41 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 42 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 43 | return 44 | iprot.readStructBegin() 45 | while True: 46 | (fname, ftype, fid) = iprot.readFieldBegin() 47 | if ftype == TType.STOP: 48 | break 49 | if fid == 1: 50 | if ftype == TType.STRING: 51 | self.docid = iprot.readString(); 52 | else: 53 | iprot.skip(ftype) 54 | elif fid == 2: 55 | if ftype == TType.STRING: 56 | self.text = iprot.readString(); 57 | else: 58 | iprot.skip(ftype) 59 | elif fid == 3: 60 | if ftype == TType.DOUBLE: 61 | self.consinesim = iprot.readDouble(); 62 | else: 63 | iprot.skip(ftype) 64 | else: 65 | iprot.skip(ftype) 66 | iprot.readFieldEnd() 67 | iprot.readStructEnd() 68 | 69 | def write(self, oprot): 70 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 71 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 72 | return 73 | oprot.writeStructBegin('Doc_Info') 74 | if self.docid is not None: 75 | oprot.writeFieldBegin('docid', TType.STRING, 1) 76 | oprot.writeString(self.docid) 77 | oprot.writeFieldEnd() 78 | if self.text is not None: 79 | oprot.writeFieldBegin('text', TType.STRING, 2) 80 | oprot.writeString(self.text) 81 | oprot.writeFieldEnd() 82 | if self.consinesim is not None: 83 | oprot.writeFieldBegin('consinesim', TType.DOUBLE, 3) 84 | oprot.writeDouble(self.consinesim) 85 | oprot.writeFieldEnd() 86 | oprot.writeFieldStop() 87 | oprot.writeStructEnd() 88 | 89 | def validate(self): 90 | return 91 | 92 | 93 | def __hash__(self): 94 | value = 17 95 | value = (value * 31) ^ hash(self.docid) 96 | value = (value * 31) ^ hash(self.text) 97 | value = (value * 31) ^ hash(self.consinesim) 98 | return value 99 | 100 | def __repr__(self): 101 | L = ['%s=%r' % (key, value) 102 | for key, value in self.__dict__.iteritems()] 103 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 104 | 105 | def __eq__(self, other): 106 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 107 | 108 | def __ne__(self, other): 109 | return not (self == other) 110 | 111 | class Topic_Info: 112 | """ 113 | Attributes: 114 | - topicid 115 | - topicwords 116 | - doclist 117 | """ 118 | 119 | thrift_spec = ( 120 | None, # 0 121 | (1, TType.I32, 'topicid', None, -1, ), # 1 122 | (2, TType.STRING, 'topicwords', None, "", ), # 2 123 | (3, TType.LIST, 'doclist', (TType.STRUCT,(Doc_Info, Doc_Info.thrift_spec)), [ 124 | ], ), # 3 125 | ) 126 | 127 | def __init__(self, topicid=thrift_spec[1][4], topicwords=thrift_spec[2][4], doclist=thrift_spec[3][4],): 128 | self.topicid = topicid 129 | self.topicwords = topicwords 130 | if doclist is self.thrift_spec[3][4]: 131 | doclist = [ 132 | ] 133 | self.doclist = doclist 134 | 135 | def read(self, iprot): 136 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 137 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 138 | return 139 | iprot.readStructBegin() 140 | while True: 141 | (fname, ftype, fid) = iprot.readFieldBegin() 142 | if ftype == TType.STOP: 143 | break 144 | if fid == 1: 145 | if ftype == TType.I32: 146 | self.topicid = iprot.readI32(); 147 | else: 148 | iprot.skip(ftype) 149 | elif fid == 2: 150 | if ftype == TType.STRING: 151 | self.topicwords = iprot.readString(); 152 | else: 153 | iprot.skip(ftype) 154 | elif fid == 3: 155 | if ftype == TType.LIST: 156 | self.doclist = [] 157 | (_etype3, _size0) = iprot.readListBegin() 158 | for _i4 in xrange(_size0): 159 | _elem5 = Doc_Info() 160 | _elem5.read(iprot) 161 | self.doclist.append(_elem5) 162 | iprot.readListEnd() 163 | else: 164 | iprot.skip(ftype) 165 | else: 166 | iprot.skip(ftype) 167 | iprot.readFieldEnd() 168 | iprot.readStructEnd() 169 | 170 | def write(self, oprot): 171 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 172 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 173 | return 174 | oprot.writeStructBegin('Topic_Info') 175 | if self.topicid is not None: 176 | oprot.writeFieldBegin('topicid', TType.I32, 1) 177 | oprot.writeI32(self.topicid) 178 | oprot.writeFieldEnd() 179 | if self.topicwords is not None: 180 | oprot.writeFieldBegin('topicwords', TType.STRING, 2) 181 | oprot.writeString(self.topicwords) 182 | oprot.writeFieldEnd() 183 | if self.doclist is not None: 184 | oprot.writeFieldBegin('doclist', TType.LIST, 3) 185 | oprot.writeListBegin(TType.STRUCT, len(self.doclist)) 186 | for iter6 in self.doclist: 187 | iter6.write(oprot) 188 | oprot.writeListEnd() 189 | oprot.writeFieldEnd() 190 | oprot.writeFieldStop() 191 | oprot.writeStructEnd() 192 | 193 | def validate(self): 194 | return 195 | 196 | 197 | def __hash__(self): 198 | value = 17 199 | value = (value * 31) ^ hash(self.topicid) 200 | value = (value * 31) ^ hash(self.topicwords) 201 | value = (value * 31) ^ hash(self.doclist) 202 | return value 203 | 204 | def __repr__(self): 205 | L = ['%s=%r' % (key, value) 206 | for key, value in self.__dict__.iteritems()] 207 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 208 | 209 | def __eq__(self, other): 210 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 211 | 212 | def __ne__(self, other): 213 | return not (self == other) 214 | -------------------------------------------------------------------------------- /gen-py/rpc/ttypes.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanjunxiao/tag_doc_with_lda/5ba0051a6d74d42b540f23f97722010929364c9c/gen-py/rpc/ttypes.pyc -------------------------------------------------------------------------------- /mylog.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import logging 4 | import logging.config 5 | 6 | 7 | formatter_dict = { 8 | 1 : logging.Formatter("%(message)s"), 9 | 2 : logging.Formatter("%(levelname)s - %(message)s"), 10 | 3 : logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"), 11 | 4 : logging.Formatter("%(asctime)s - %(levelname)s - %(message)s - [%(name)s]"), 12 | 5 : logging.Formatter("%(asctime)s - %(levelname)s - %(message)s - [%(name)s:%(lineno)s]") 13 | } 14 | class Logger(object): 15 | def __init__(self, logname, loglevel, callfile): 16 | ''' 17 | 指定日志文件路径,日志级别,以及调用文件 18 | 将日志存入到指定的文件中 19 | ''' 20 | self.logger = logging.getLogger(callfile) 21 | self.logger.setLevel(logging.DEBUG) 22 | self.fh = logging.FileHandler(logname) 23 | 24 | self.ch = logging.StreamHandler() 25 | self.ch.setLevel(logging.ERROR) 26 | self.ch.setFormatter(formatter_dict[int(loglevel)]) 27 | self.fh.setFormatter(formatter_dict[int(loglevel)]) 28 | self.logger.addHandler(self.ch) 29 | self.logger.addHandler(self.fh) 30 | 31 | def get_logger(self): 32 | return self.logger 33 | 34 | def close(self): 35 | #logging.shutdown() 36 | self.logger.removeHandler(self.fh) 37 | self.fh.flush() 38 | self.fh.close() 39 | 40 | self.logger.removeHandler(self.ch) 41 | self.ch.flush() 42 | self.ch.close() 43 | 44 | 45 | if __name__ == '__main__': 46 | """ 47 | logger = Logger(logname='hahaha', loglevel=1, callfile=__file__).get_logger() # 48 | logger.info('test level1') 49 | 50 | logger1 = Logger(logname='hahaha2', loglevel=2, callfile=__file__).get_logger() 51 | logger1.info('test level2') 52 | """ 53 | 54 | mylogger = Logger(logname='hahaha3', loglevel=3, callfile=__file__) 55 | logger2 = mylogger.get_logger() 56 | logger2.info('test level3') 57 | logger2.warn("this is warn") 58 | logger2.error("this is error") 59 | logger2.critical('this is critical') 60 | mylogger.close() 61 | 62 | 63 | -------------------------------------------------------------------------------- /rpc.thrift: -------------------------------------------------------------------------------- 1 | //include "doc.thrift" 2 | 3 | struct Doc_Info 4 | { 5 | 1: string docid = '' 6 | 2: string text 7 | 3: double consinesim = 0.0 8 | } 9 | 10 | typedef list Docs 11 | 12 | struct Topic_Info 13 | { 14 | 1: i32 topicid = -1 15 | 2: string topicwords = '' 16 | 3: Docs doclist = [] 17 | } 18 | 19 | typedef list Topics 20 | 21 | service DocServlet 22 | { 23 | string Ping(1: string sth); 24 | Topics Infer(1: Docs docs, 2: i32 burn_in_iterations=15, 3: i32 accumulating_iterations=10, 4: i32 docnumoftopic=15); 25 | Topics GetTopics(1: i32 docnumoftopic=0) 26 | oneway void Sender2(1: Docs docs); 27 | } 28 | -------------------------------------------------------------------------------- /start-infer_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bin=`dirname "$0"` 3 | bin=`cd "$bin";pwd` 4 | if [ -f "$bin"/.pid_infer.txt_ ]; then 5 | "$bin"/stop-infer_server.sh 6 | fi 7 | echo "`date +"%Y-%m-%d %H:%M:%S"` INFO: start INFER SERVER ... ..." >> "$bin"/.runtime.log 2>&1 8 | 9 | #. "$bin"/set_env.sh 10 | 11 | pushd "$bin" > /dev/null 12 | python "$bin"/Server.py >> "$bin"/.runtime.log 2>&1 & 13 | popd > /dev/null 14 | echo $! > "$bin"/.pid_infer.txt_ 15 | 16 | sleep 5 > /dev/null 17 | echo -e '\tINFER SERVER is successfully started!' >>"$bin"/.runtime.log 2>&1 18 | -------------------------------------------------------------------------------- /stop-infer_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bin=`dirname "$0"` 3 | bin=`cd "$bin";pwd` 4 | for i in `cat "$bin"/.pid_infer.txt_`; 5 | do 6 | ps awx -o "%p %P"|grep -w $i| awk '{ print $1 }'|xargs kill -9 7 | echo "`date +"%Y-%m-%d %H:%M:%S"`, INFO: INFER SERVER $i is stopped" 2>&1 | tee -a "$bin"/.runtime.log 8 | done; 9 | 10 | rm -rf "$bin"/.pid_infer.txt_ 11 | -------------------------------------------------------------------------------- /weibo_cluster.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #__author__ = 'nanjunxiao' 3 | 4 | import sys 5 | import os 6 | import time 7 | import math 8 | import pymongo 9 | import jieba 10 | import json 11 | from mylog import Logger 12 | from simhash import Simhash,SimhashIndex 13 | 14 | #========================================== 15 | timestep=86400 16 | mongoip='***'; mongoport=27017; mongodb='***'; mongocollection='***' 17 | mongouser='***'; mongopwd='***' 18 | 19 | exedirname=os.path.dirname(os.path.abspath(sys.argv[0]) ) 20 | stopwordsfilename = '%s/extra_dict/stop_words.utf8' %exedirname 21 | docfile='%s/result/weibo_doc.txt'%exedirname; cutfile='%s/result/weibo_cut.txt'%exedirname 22 | thetafile='%s/result/model-final.theta'%exedirname;topicfile='%s/result/model-final.twords'%exedirname 23 | 24 | topicnum=100;iternum=3000 25 | #========================================== 26 | def dict2json(djson): 27 | try: 28 | text = json.dumps(djson,ensure_ascii = False) 29 | except Exception,e: 30 | print >> sys.stderr, djson, e 31 | return '' 32 | return text 33 | 34 | def json2dict(text): 35 | try: 36 | djson = json.loads(text,encoding='utf-8') 37 | except Exception,e: 38 | print >> sys.stderr, text , e 39 | return None 40 | return djson 41 | 42 | '''consine''' 43 | def consinelen(sorted_list): 44 | powerone = 0.0 45 | for k, v in sorted_list: 46 | powerone += v * v 47 | return math.sqrt(powerone) 48 | 49 | def consine(dictone, dicttwo): 50 | if len(dictone) == 0 or len(dicttwo) == 0: 51 | return 0.0 52 | # sorted_one = sorted(dictone.items(),key=lambda d:d[0]) 53 | sorted_one = sorted(dictone.items()) 54 | sorted_two = sorted(dicttwo.items()) 55 | sum = 0.0 56 | i = 0 57 | j = 0 58 | while i < len(sorted_one) and j < len(sorted_two): 59 | if sorted_one[i][0] == sorted_two[j][0]: 60 | sum += sorted_one[i][1] * sorted_two[j][1] 61 | i += 1 62 | j += 1 63 | elif sorted_one[i][0] < sorted_two[j][0]: 64 | i += 1 65 | else: 66 | j += 1 67 | # if sum-0.0 < 0.001: 68 | if sum <= 0.0: 69 | return 0.0 70 | else: 71 | powerone = consinelen(sorted_one) 72 | powertwo = consinelen(sorted_two) 73 | if powerone > 0.0 and powertwo > 0.0: 74 | return sum / (powerone * powertwo) 75 | else: 76 | return 0.0 77 | 78 | """文档属于那个类""" 79 | def whichcluster(listweight): 80 | return max(enumerate(listweight), key=lambda x: x[1])[0] 81 | 82 | """文档向类中心向量叠加""" 83 | def addcenter(clustercenters, which, lineweight): 84 | for i in range(len(lineweight)): 85 | clustercenters[which][i] += lineweight[i] 86 | 87 | """adapter""" 88 | def cosineadapter(vlist, centerlist): 89 | vdict = {} 90 | centerdict = {} 91 | id = 0 92 | for i in range(len(vlist)): 93 | vdict[id] = vlist[i] 94 | centerdict[id] = centerlist[i] 95 | id += 1 96 | return consine(vdict, centerdict) 97 | 98 | """每个类中所有文档距中心sim""" 99 | def cossim2center(k, clusterdict_list, clustercenters): 100 | cossim_dict = {}#lineid-sim 101 | for i, vlist in clusterdict_list[k].items(): 102 | cossim = cosineadapter(vlist, clustercenters[k]) 103 | # print cossim 104 | cossim_dict[i]=cossim 105 | return sorted(cossim_dict.items(),key=lambda d:d[1],reverse=True) 106 | 107 | """topic,topic下文档按相似度降序排列""" 108 | def showtopicwithdocs(K,thetafile,topicfile,docfile,showfile): 109 | ftheta = open(thetafile,'r') 110 | ftopic = open(topicfile,'r') 111 | fdoc = open(docfile,'r') 112 | fshow = open(showfile,'w') 113 | 114 | clustercenters = []#[[],[] ] 115 | clusterdict_list = []#[{2:[w1,w2,w3,w4],3:[] },{}] 116 | for i in range(K): 117 | clustercenters.append([0.0]*K) 118 | clusterdict_list.append({}) 119 | lineno = 0 120 | for line in ftheta: 121 | line_list = line.strip().split() 122 | line_list = [float(w) for w in line_list] 123 | which = whichcluster(line_list) 124 | clusterdict_list[which][lineno] = line_list 125 | addcenter(clustercenters,which,line_list) 126 | lineno+=1 127 | ################ 128 | doc_list = fdoc.readlines() 129 | topicid = -1 130 | topicwords = '' 131 | for line in ftopic: 132 | if line.startswith('Topic'): 133 | if topicid==-1: 134 | topicid += 1;topicwords = '' 135 | continue 136 | else: 137 | #fshow.write(topicwords+'\n') 138 | #sorted_doc2center_sims = cossim2center(topicid,clusterdict_list,clustercenters) 139 | #for lineno,sim in sorted_doc2center_sims: 140 | #fshow.write('cosinesim:%f\tlineno:%d\n' %(sim,lineno)) 141 | #fshow.write('%s\n' %(doc_list[lineno].strip() )) 142 | #fshow.write('===========================================================\n') 143 | topic_json = {} 144 | topic_json['topicid'] = topicid; topic_json['topicwords'] = topicwords; topic_json['doclist'] = [] 145 | sorted_doc2center_sims = cossim2center(topicid,clusterdict_list,clustercenters) 146 | for lineno,sim in sorted_doc2center_sims: 147 | one_doc = doc_list[lineno].strip().split() 148 | topic_json['doclist'].append({'docid':one_doc[0],'consinesim':sim,'lineno':lineno,'text':one_doc[1] }) 149 | #print dict2json(topic_json) 150 | #sys.exit(1) 151 | fshow.write(dict2json(topic_json)+'\n' ) 152 | topicid += 1;topicwords = '' 153 | continue 154 | topicwords += line.strip().split()[0]+' ' 155 | 156 | topic_json = {} 157 | topic_json['topicid'] = topicid; topic_json['topicwords'] = topicwords; topic_json['doclist'] = [] 158 | sorted_doc2center_sims = cossim2center(topicid,clusterdict_list,clustercenters) 159 | for lineno,sim in sorted_doc2center_sims: 160 | one_doc = doc_list[lineno].strip().split() 161 | topic_json['doclist'].append({'docid':one_doc[0],'consinesim':sim,'lineno':lineno,'text':one_doc[1] }) 162 | fshow.write(dict2json(topic_json)+'\n' ) 163 | 164 | fshow.close() 165 | fdoc.close() 166 | ftopic.close() 167 | ftheta.close() 168 | 169 | def loadstopwords(filename): 170 | fp = open(filename, 'rb') 171 | stopwords = set() 172 | for line in fp: 173 | #jieba cut is unicode 174 | stopwords.add(line.strip('\n').decode('utf-8')) 175 | fp.close() 176 | return stopwords 177 | 178 | def getweibo_cut(curtimestamp,lasttimestamp): 179 | try: 180 | uri = 'mongodb://%s:%s@%s:%d/%s' % (mongouser,mongopwd,mongoip,mongoport,mongodb) 181 | logger.info('try to connect mongo. %s' %uri) 182 | connection = pymongo.MongoClient(uri) 183 | weibodb = connection.weibo_status 184 | weibocollection = weibodb.status 185 | except Exception,e: 186 | logger.critical('connect mongo error: %s' %(e) ) 187 | sys.exit(-1) 188 | 189 | logger.info('connect mongo ok.' ) 190 | 191 | try: 192 | logger.info('{create_time:{$gte:%ld,$lt:%ld} }' %(lasttimestamp,curtimestamp) ) 193 | status_count = weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }).count() 194 | logger.info('status_count: %d' %status_count) 195 | if status_count < 10: 196 | connection.close();mylogger.close() 197 | sys.exit(0) 198 | stopwords = loadstopwords(stopwordsfilename) 199 | fdoc=open(docfile,'w');fcut=open(cutfile,'w') 200 | num=0;simnum=0;cutnum=0 201 | #simhash 202 | index = SimhashIndex({}) 203 | for one in weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }): 204 | weibo_id = str(one['_id']) 205 | weibo_text = one['data']['text'].strip() 206 | text_sh = Simhash(weibo_text ) 207 | if len(index.get_near_dups(text_sh) ) == 0: #not find sim 208 | #cut 209 | text_seg = jieba.cut(weibo_text) 210 | text_result = list(set(text_seg) - stopwords) 211 | content = ' 1 '.join(text_result) 212 | if content != '': 213 | fdoc.write(weibo_id+'\t'+weibo_text.encode('utf-8')+'\n');fcut.write(content.encode('utf-8')+' 1\n') 214 | cutnum += 1 215 | simnum += 1 216 | num += 1 217 | index.add(num,text_sh) 218 | except pymongo.errors,e: 219 | logger.critical('mongo find error: %s' %e) 220 | sys.exit(-2) 221 | 222 | logger.info('simnum: %d ' %simnum); 223 | logger.info('cutnum: %d ' %cutnum); 224 | connection.close() 225 | fdoc.close();fcut.close() 226 | 227 | def main(): 228 | curtimestamp=0;lasttimestamp=0 229 | if len(sys.argv)==4 and sys.argv[1]=='-BETime': 230 | lasttimestamp = long(sys.argv[2]) 231 | curtimestamp = long(sys.argv[3]) 232 | elif len(sys.argv)==2 and sys.argv[1]=='-SYSTime': 233 | curtimestamp = time.time() 234 | lasttimestamp = curtimestamp - timestep 235 | else: 236 | print 'usage: \n' \ 237 | 'exe [-BETime] [begintime] [endtime]\n' \ 238 | 'exe [-SYSTime]' 239 | sys.exit(-3) 240 | 241 | getweibo_cut(curtimestamp,lasttimestamp) 242 | logger.info('----------------------------------------------------->') 243 | logger.info('K:%d\tIter:%d\t%s\n' %(topicnum,iternum,time.asctime())) 244 | alpha = 50.0/topicnum 245 | #os.system('time %s/lda -est -ntopics %d -niters %d -savestep %d -twords %d -dfile %s'\ 246 | #%(exedirname,topicnum,iternum,10000,15,cutfile) ) 247 | 248 | logger.info('time %s/ompi_lda --num_topics=%d --alpha=%f --beta=0.1 --compute_loglikelihood=false --training_data_file=%s --model_file=%s/result/model_file --twords_file=%s --theta_file=%s --burn_in_iterations=%d --accumulating_iterations=%d --num_openmp_threads=8 '\ 249 | %(exedirname,topicnum,alpha,cutfile,exedirname,topicfile,thetafile,iternum,iternum*0.5) ) 250 | os.system('time %s/ompi_lda --num_topics=%d --alpha=%f --beta=0.1 --compute_loglikelihood=false --training_data_file=%s --model_file=%s/result/model_file --twords_file=%s --theta_file=%s --burn_in_iterations=%d --accumulating_iterations=%d --num_openmp_threads=8 '\ 251 | %(exedirname,topicnum,alpha,cutfile,exedirname,topicfile,thetafile,iternum,iternum*0.5) ) 252 | showfile='%s/result/%s-%s.result' %(exedirname, time.strftime('%Y%m%d%H%M%S',time.localtime(lasttimestamp) ),time.strftime('%Y%m%d%H%M%S',time.localtime(curtimestamp) ) ) 253 | showtopicwithdocs(topicnum,thetafile,topicfile,docfile,showfile) 254 | logger.info('K:%d\tIter:%d\t%s\n' %(topicnum,iternum,time.asctime())) 255 | 256 | if __name__ == '__main__': 257 | #logger 258 | mylogger = Logger(logname='%s/cluster.log'%exedirname, loglevel=3, callfile=__file__) 259 | logger = mylogger.get_logger() 260 | 261 | main() 262 | 263 | #logger 264 | mylogger.close() 265 | --------------------------------------------------------------------------------