├── .github └── FUNDING.yml ├── .gitignore ├── README.md └── site ├── __init__.py ├── __pycache__ └── __init__.cpython-34.pyc └── mybzz ├── StopWords.txt ├── __init__.py ├── __pycache__ └── __init__.cpython-34.pyc ├── crawler ├── 360Crawler.py ├── AppleCrawler.py ├── BaiDuCrawler.py ├── MZCrawler.py ├── TencentCrawler.py ├── WdjCrawler.py └── XiaoMiCrawler.py ├── domain ├── MzComment.py └── __pycache__ │ └── MzComment.cpython-34.pyc ├── keyword ├── CreateWordCloud.py └── InsertKeyWord.py ├── neg_review.pkl ├── pos_review.pkl ├── sentiment ├── Feature.py ├── Vec.py ├── feature_selection.py └── test1.txt ├── test ├── NltkUtil.py ├── Ran.py ├── Test.py ├── TestA.py ├── TestBs.py ├── TestDb.py ├── TestJieBa.py ├── TestNltk.py ├── TestPa.py ├── TestPkl.py ├── TestPred.py ├── TestQQ.py ├── TestReadFile.py ├── TestSk.py ├── TestWordCloud.py ├── model.m ├── model.pkl ├── neg_review.pkl └── pos_review.pkl └── util ├── BsUtil.py ├── DateUtil.py └── DbUtil.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | custom: # Replace with a single custom sponsorship URL 9 | 10 | #thx anyway 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *__pycache__* 3 | *.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CommentCrawler 2 | Crawlers that scrape user reviews from the major app stores 3 | -------------------------------------------------------------------------------- /site/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/__init__.py -------------------------------------------------------------------------------- /site/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /site/mybzz/StopWords.txt: -------------------------------------------------------------------------------- 1 | , 2 | 。 3 | ? 4 | ! 5 | ; 6 | “ 7 | # 8 | : 9 | ! 10 | " 11 | # 12 | $ 13 | % 14 | & 15 | ' 16 | ( 17 | ) 18 | * 19 | + 20 | , 21 | - 22 | -- 23 | . 24 | .. 25 | ... 26 | ...... 27 | ................... 28 | ./ 29 | .一 30 | .数 31 | .日 32 | / 33 | // 34 | 0 35 | 1 36 | 2 37 | 3 38 | 4 39 | 5 40 | 6 41 | 7 42 | 8 43 | 9 44 | : 45 | :// 46 | :: 47 | ; 48 | < 49 | = 50 | > 51 | >> 52 | ? 53 | @ 54 | A 55 | Lex 56 | [ 57 | \ 58 | ] 59 | ^ 60 | _ 61 | ` 62 | exp 63 | sub 64 | sup 65 | | 66 | } 67 | ~ 68 | ~~~~ 69 | · 70 | × 71 | ××× 72 | Δ 73 | Ψ 74 | γ 75 | μ 76 | φ 77 | φ. 
78 | В 79 | — 80 | —— 81 | ——— 82 | ‘ 83 | ’ 84 | ’‘ 85 | “ 86 | ” 87 | ”, 88 | … 89 | …… 90 | …………………………………………………③ 91 | ′∈ 92 | ′| 93 | ℃ 94 | Ⅲ 95 | ↑ 96 | → 97 | ∈[ 98 | ∪φ∈ 99 | ≈ 100 | ① 101 | ② 102 | ②c 103 | ③ 104 | ③] 105 | ④ 106 | ⑤ 107 | ⑥ 108 | ⑦ 109 | ⑧ 110 | ⑨ 111 | ⑩ 112 | ── 113 | ■ 114 | ▲ 115 |   116 | 、 117 | 。 118 | 〈 119 | 〉 120 | 《 121 | 》 122 | 》), 123 | 」 124 | 『 125 | 』 126 | 【 127 | 】 128 | 〔 129 | 〕 130 | 〕〔 131 | ㈧ 132 | 一 133 | 一. 134 | 一一 135 | 一下 136 | 一个 137 | 一些 138 | 一何 139 | 一切 140 | 一则 141 | 一则通过 142 | 一天 143 | 一定 144 | 一方面 145 | 一旦 146 | 一时 147 | 一来 148 | 一样 149 | 一次 150 | 一片 151 | 一番 152 | 一直 153 | 一致 154 | 一般 155 | 一起 156 | 一转眼 157 | 一边 158 | 一面 159 | 七 160 | 万一 161 | 三 162 | 三天两头 163 | 三番两次 164 | 三番五次 165 | 上 166 | 上下 167 | 上升 168 | 上去 169 | 上来 170 | 上述 171 | 上面 172 | 下 173 | 下列 174 | 下去 175 | 下来 176 | 下面 177 | 不 178 | 不一 179 | 不下 180 | 不久 181 | 不了 182 | 不亦乐乎 183 | 不仅 184 | 不仅...而且 185 | 不仅仅 186 | 不仅仅是 187 | 不会 188 | 不但 189 | 不但...而且 190 | 不光 191 | 不免 192 | 不再 193 | 不力 194 | 不单 195 | 不变 196 | 不只 197 | 不可 198 | 不可开交 199 | 不可抗拒 200 | 不同 201 | 不外 202 | 不外乎 203 | 不够 204 | 不大 205 | 不如 206 | 不妨 207 | 不定 208 | 不对 209 | 不少 210 | 不尽 211 | 不尽然 212 | 不巧 213 | 不已 214 | 不常 215 | 不得 216 | 不得不 217 | 不得了 218 | 不得已 219 | 不必 220 | 不怎么 221 | 不怕 222 | 不惟 223 | 不成 224 | 不拘 225 | 不择手段 226 | 不敢 227 | 不料 228 | 不断 229 | 不日 230 | 不时 231 | 不是 232 | 不曾 233 | 不止 234 | 不止一次 235 | 不比 236 | 不消 237 | 不满 238 | 不然 239 | 不然的话 240 | 不特 241 | 不独 242 | 不由得 243 | 不知不觉 244 | 不管 245 | 不管怎样 246 | 不经意 247 | 不胜 248 | 不能 249 | 不能不 250 | 不至于 251 | 不若 252 | 不要 253 | 不论 254 | 不起 255 | 不足 256 | 不过 257 | 不迭 258 | 不问 259 | 不限 260 | 与 261 | 与其 262 | 与其说 263 | 与否 264 | 与此同时 265 | 专门 266 | 且 267 | 且不说 268 | 且说 269 | 两者 270 | 严格 271 | 严重 272 | 个 273 | 个人 274 | 个别 275 | 中小 276 | 中间 277 | 丰富 278 | 串行 279 | 临 280 | 临到 281 | 为 282 | 为主 283 | 为了 284 | 为什么 285 | 为什麽 286 | 为何 287 | 为止 288 | 为此 289 | 为着 290 | 主张 291 | 主要 292 | 举凡 293 | 举行 294 | 乃 295 | 乃至 296 | 乃至于 297 | 么 298 | 之 299 | 之一 300 | 之前 301 | 之后 302 | 之後 303 | 之所以 304 | 之类 305 | 乌乎 306 | 乎 307 | 乒 308 | 乘 309 | 乘势 310 | 乘机 311 | 乘胜 312 | 乘虚 313 | 乘隙 314 | 九 315 | 也 316 | 也好 317 | 也就是说 318 | 也是 319 | 也罢 320 | 了 321 | 了解 322 | 争取 323 | 二 324 | 二来 325 | 二话不说 326 | 二话没说 327 | 于 328 | 于是 329 | 于是乎 330 | 云云 331 | 云尔 332 | 互 333 | 互相 334 | 五 335 | 些 336 | 交口 337 | 亦 338 | 产生 339 | 亲口 340 | 亲手 341 | 亲眼 342 | 亲自 343 | 亲身 344 | 人 345 | 人人 346 | 人们 347 | 人家 348 | 人民 349 | 什么 350 | 什么样 351 | 什麽 352 | 仅 353 | 仅仅 354 | 今 355 | 今后 356 | 今天 357 | 今年 358 | 今後 359 | 介于 360 | 仍 361 | 仍旧 362 | 仍然 363 | 从 364 | 从不 365 | 从严 366 | 从中 367 | 从事 368 | 从今以后 369 | 从优 370 | 从古到今 371 | 从古至今 372 | 从头 373 | 从宽 374 | 从小 375 | 从新 376 | 从无到有 377 | 从早到晚 378 | 从未 379 | 从来 380 | 从此 381 | 从此以后 382 | 从而 383 | 从轻 384 | 从速 385 | 从重 386 | 他 387 | 他人 388 | 他们 389 | 他是 390 | 他的 391 | 代替 392 | 以 393 | 以上 394 | 以下 395 | 以为 396 | 以便 397 | 以免 398 | 以前 399 | 以及 400 | 以后 401 | 以外 402 | 以後 403 | 以故 404 | 以期 405 | 以来 406 | 以至 407 | 以至于 408 | 以致 409 | 们 410 | 任 411 | 任何 412 | 任凭 413 | 任务 414 | 企图 415 | 伙同 416 | 会 417 | 伟大 418 | 传 419 | 传说 420 | 传闻 421 | 似乎 422 | 似的 423 | 但 424 | 但凡 425 | 但愿 426 | 但是 427 | 何 428 | 何乐而不为 429 | 何以 430 | 何况 431 | 何处 432 | 何妨 433 | 何尝 434 | 何必 435 | 何时 436 | 何止 437 | 何苦 438 | 何须 439 | 余外 440 | 作为 441 | 你 442 | 你们 443 | 你是 444 | 你的 445 | 使 446 | 使得 447 | 使用 448 | 例如 449 | 依 450 | 依据 451 | 依照 452 | 依靠 453 | 便 454 | 便于 455 | 促进 456 | 保持 457 | 保管 458 | 保险 459 | 俺 460 | 俺们 461 | 倍加 462 | 倍感 463 | 倒不如 464 | 倒不如说 465 | 倒是 466 | 倘 467 | 倘使 468 | 倘或 469 | 倘然 470 | 倘若 471 | 借 472 | 借以 
473 | 借此 474 | 假使 475 | 假如 476 | 假若 477 | 偏偏 478 | 做到 479 | 偶尔 480 | 偶而 481 | 傥然 482 | 像 483 | 儿 484 | 允许 485 | 元/吨 486 | 充其极 487 | 充其量 488 | 充分 489 | 先不先 490 | 先后 491 | 先後 492 | 先生 493 | 光 494 | 光是 495 | 全体 496 | 全力 497 | 全年 498 | 全然 499 | 全身心 500 | 全部 501 | 全都 502 | 全面 503 | 八 504 | 八成 505 | 公然 506 | 六 507 | 兮 508 | 共 509 | 共同 510 | 共总 511 | 关于 512 | 其 513 | 其一 514 | 其中 515 | 其二 516 | 其他 517 | 其余 518 | 其后 519 | 其它 520 | 其实 521 | 其次 522 | 具体 523 | 具体地说 524 | 具体来说 525 | 具体说来 526 | 具有 527 | 兼之 528 | 内 529 | 再 530 | 再其次 531 | 再则 532 | 再有 533 | 再次 534 | 再者 535 | 再者说 536 | 再说 537 | 冒 538 | 冲 539 | 决不 540 | 决定 541 | 决非 542 | 况且 543 | 准备 544 | 凑巧 545 | 凝神 546 | 几 547 | 几乎 548 | 几度 549 | 几时 550 | 几番 551 | 几经 552 | 凡 553 | 凡是 554 | 凭 555 | 凭借 556 | 出 557 | 出于 558 | 出去 559 | 出来 560 | 出现 561 | 分别 562 | 分头 563 | 分期 564 | 分期分批 565 | 切 566 | 切不可 567 | 切切 568 | 切勿 569 | 切莫 570 | 则 571 | 则甚 572 | 刚 573 | 刚好 574 | 刚巧 575 | 刚才 576 | 初 577 | 别 578 | 别人 579 | 别处 580 | 别是 581 | 别的 582 | 别管 583 | 别说 584 | 到 585 | 到了儿 586 | 到处 587 | 到头 588 | 到头来 589 | 到底 590 | 到目前为止 591 | 前后 592 | 前此 593 | 前者 594 | 前进 595 | 前面 596 | 加上 597 | 加之 598 | 加以 599 | 加入 600 | 加强 601 | 动不动 602 | 动辄 603 | 勃然 604 | 匆匆 605 | 十分 606 | 千 607 | 千万 608 | 千万千万 609 | 半 610 | 单 611 | 单单 612 | 单纯 613 | 即 614 | 即令 615 | 即使 616 | 即便 617 | 即刻 618 | 即如 619 | 即将 620 | 即或 621 | 即是说 622 | 即若 623 | 却 624 | 却不 625 | 历 626 | 原来 627 | 去 628 | 又 629 | 又及 630 | 及 631 | 及其 632 | 及时 633 | 及至 634 | 双方 635 | 反之 636 | 反之亦然 637 | 反之则 638 | 反倒 639 | 反倒是 640 | 反应 641 | 反手 642 | 反映 643 | 反而 644 | 反过来 645 | 反过来说 646 | 取得 647 | 取道 648 | 受到 649 | 变成 650 | 古来 651 | 另 652 | 另一个 653 | 另一方面 654 | 另外 655 | 另悉 656 | 另方面 657 | 另行 658 | 只 659 | 只当 660 | 只怕 661 | 只是 662 | 只有 663 | 只消 664 | 只要 665 | 只限 666 | 叫 667 | 叫做 668 | 召开 669 | 叮咚 670 | 叮当 671 | 可 672 | 可以 673 | 可好 674 | 可是 675 | 可能 676 | 可见 677 | 各 678 | 各个 679 | 各人 680 | 各位 681 | 各地 682 | 各式 683 | 各种 684 | 各级 685 | 各自 686 | 合理 687 | 同 688 | 同一 689 | 同时 690 | 同样 691 | 后 692 | 后来 693 | 后者 694 | 后面 695 | 向 696 | 向使 697 | 向着 698 | 吓 699 | 吗 700 | 否则 701 | 吧 702 | 吧哒 703 | 吱 704 | 呀 705 | 呃 706 | 呆呆地 707 | 呐 708 | 呕 709 | 呗 710 | 呜 711 | 呜呼 712 | 呢 713 | 周围 714 | 呵 715 | 呵呵 716 | 呸 717 | 呼哧 718 | 呼啦 719 | 咋 720 | 和 721 | 咚 722 | 咦 723 | 咧 724 | 咱 725 | 咱们 726 | 咳 727 | 哇 728 | 哈 729 | 哈哈 730 | 哉 731 | 哎 732 | 哎呀 733 | 哎哟 734 | 哗 735 | 哗啦 736 | 哟 737 | 哦 738 | 哩 739 | 哪 740 | 哪个 741 | 哪些 742 | 哪儿 743 | 哪天 744 | 哪年 745 | 哪怕 746 | 哪样 747 | 哪边 748 | 哪里 749 | 哼 750 | 哼唷 751 | 唉 752 | 唯有 753 | 啊 754 | 啊呀 755 | 啊哈 756 | 啊哟 757 | 啐 758 | 啥 759 | 啦 760 | 啪达 761 | 啷当 762 | 喀 763 | 喂 764 | 喏 765 | 喔唷 766 | 喽 767 | 嗡 768 | 嗡嗡 769 | 嗬 770 | 嗯 771 | 嗳 772 | 嘎 773 | 嘎嘎 774 | 嘎登 775 | 嘘 776 | 嘛 777 | 嘻 778 | 嘿 779 | 嘿嘿 780 | 四 781 | 因 782 | 因为 783 | 因了 784 | 因此 785 | 因着 786 | 因而 787 | 固 788 | 固然 789 | 在 790 | 在下 791 | 在于 792 | 地 793 | 均 794 | 坚决 795 | 坚持 796 | 基于 797 | 基本 798 | 基本上 799 | 处在 800 | 处处 801 | 处理 802 | 复杂 803 | 多 804 | 多么 805 | 多亏 806 | 多多 807 | 多多少少 808 | 多多益善 809 | 多少 810 | 多年前 811 | 多年来 812 | 多数 813 | 多次 814 | 够瞧的 815 | 大 816 | 大不了 817 | 大举 818 | 大事 819 | 大体 820 | 大体上 821 | 大凡 822 | 大力 823 | 大多 824 | 大多数 825 | 大大 826 | 大家 827 | 大张旗鼓 828 | 大批 829 | 大抵 830 | 大概 831 | 大略 832 | 大约 833 | 大致 834 | 大都 835 | 大量 836 | 大面儿上 837 | 失去 838 | 奇 839 | 奈 840 | 奋勇 841 | 她 842 | 她们 843 | 她是 844 | 她的 845 | 好在 846 | 好的 847 | 好象 848 | 如 849 | 如上 850 | 如上所述 851 | 如下 852 | 如今 853 | 如何 854 | 如其 855 | 如前所述 856 | 如同 857 | 如常 858 | 如是 859 | 如期 860 | 如果 861 | 如次 862 | 如此 863 | 如此等等 864 | 如若 865 | 始而 866 | 姑且 867 | 存在 868 | 存心 869 | 孰料 870 | 孰知 871 | 宁 872 | 
宁可 873 | 宁愿 874 | 宁肯 875 | 它 876 | 它们 877 | 它们的 878 | 它是 879 | 它的 880 | 安全 881 | 完全 882 | 完成 883 | 定 884 | 实现 885 | 实际 886 | 宣布 887 | 容易 888 | 密切 889 | 对 890 | 对于 891 | 对应 892 | 对待 893 | 对方 894 | 对比 895 | 将 896 | 将才 897 | 将要 898 | 将近 899 | 小 900 | 少数 901 | 尔 902 | 尔后 903 | 尔尔 904 | 尔等 905 | 尚且 906 | 尤其 907 | 就 908 | 就地 909 | 就是 910 | 就是了 911 | 就是说 912 | 就此 913 | 就算 914 | 就要 915 | 尽 916 | 尽可能 917 | 尽如人意 918 | 尽心尽力 919 | 尽心竭力 920 | 尽快 921 | 尽早 922 | 尽然 923 | 尽管 924 | 尽管如此 925 | 尽量 926 | 局外 927 | 居然 928 | 届时 929 | 属于 930 | 屡 931 | 屡屡 932 | 屡次 933 | 屡次三番 934 | 岂 935 | 岂但 936 | 岂止 937 | 岂非 938 | 川流不息 939 | 左右 940 | 巨大 941 | 巩固 942 | 差一点 943 | 差不多 944 | 己 945 | 已 946 | 已矣 947 | 已经 948 | 巴 949 | 巴巴 950 | 带 951 | 帮助 952 | 常 953 | 常常 954 | 常言说 955 | 常言说得好 956 | 常言道 957 | 平素 958 | 年复一年 959 | 并 960 | 并不 961 | 并不是 962 | 并且 963 | 并排 964 | 并无 965 | 并没 966 | 并没有 967 | 并肩 968 | 并非 969 | 广大 970 | 广泛 971 | 应当 972 | 应用 973 | 应该 974 | 庶乎 975 | 庶几 976 | 开外 977 | 开始 978 | 开展 979 | 引起 980 | 弗 981 | 弹指之间 982 | 强烈 983 | 强调 984 | 归 985 | 归根到底 986 | 归根结底 987 | 归齐 988 | 当 989 | 当下 990 | 当中 991 | 当儿 992 | 当前 993 | 当即 994 | 当口儿 995 | 当地 996 | 当场 997 | 当头 998 | 当庭 999 | 当时 1000 | 当然 1001 | 当真 1002 | 当着 1003 | 形成 1004 | 彻夜 1005 | 彻底 1006 | 彼 1007 | 彼时 1008 | 彼此 1009 | 往 1010 | 往往 1011 | 待 1012 | 待到 1013 | 很 1014 | 很多 1015 | 很少 1016 | 後来 1017 | 後面 1018 | 得 1019 | 得了 1020 | 得出 1021 | 得到 1022 | 得天独厚 1023 | 得起 1024 | 心里 1025 | 必 1026 | 必定 1027 | 必将 1028 | 必然 1029 | 必要 1030 | 必须 1031 | 快 1032 | 快要 1033 | 忽地 1034 | 忽然 1035 | 怎 1036 | 怎么 1037 | 怎么办 1038 | 怎么样 1039 | 怎奈 1040 | 怎样 1041 | 怎麽 1042 | 怕 1043 | 急匆匆 1044 | 怪 1045 | 怪不得 1046 | 总之 1047 | 总是 1048 | 总的来看 1049 | 总的来说 1050 | 总的说来 1051 | 总结 1052 | 总而言之 1053 | 恍然 1054 | 恐怕 1055 | 恰似 1056 | 恰好 1057 | 恰如 1058 | 恰巧 1059 | 恰恰 1060 | 恰恰相反 1061 | 恰逢 1062 | 您 1063 | 您们 1064 | 您是 1065 | 惟其 1066 | 惯常 1067 | 意思 1068 | 愤然 1069 | 愿意 1070 | 慢说 1071 | 成为 1072 | 成年 1073 | 成年累月 1074 | 成心 1075 | 我 1076 | 我们 1077 | 我是 1078 | 我的 1079 | 或 1080 | 或则 1081 | 或多或少 1082 | 或是 1083 | 或曰 1084 | 或者 1085 | 或许 1086 | 战斗 1087 | 截然 1088 | 截至 1089 | 所 1090 | 所以 1091 | 所在 1092 | 所幸 1093 | 所有 1094 | 所谓 1095 | 才 1096 | 才能 1097 | 扑通 1098 | 打 1099 | 打从 1100 | 打开天窗说亮话 1101 | 扩大 1102 | 把 1103 | 抑或 1104 | 抽冷子 1105 | 拦腰 1106 | 拿 1107 | 按 1108 | 按时 1109 | 按期 1110 | 按照 1111 | 按理 1112 | 按说 1113 | 挨个 1114 | 挨家挨户 1115 | 挨次 1116 | 挨着 1117 | 挨门挨户 1118 | 挨门逐户 1119 | 换句话说 1120 | 换言之 1121 | 据 1122 | 据实 1123 | 据悉 1124 | 据我所知 1125 | 据此 1126 | 据称 1127 | 据说 1128 | 掌握 1129 | 接下来 1130 | 接着 1131 | 接著 1132 | 接连不断 1133 | 放量 1134 | 故 1135 | 故意 1136 | 故此 1137 | 故而 1138 | 敞开儿 1139 | 敢 1140 | 敢于 1141 | 敢情 1142 | 数/ 1143 | 整个 1144 | 断然 1145 | 方 1146 | 方便 1147 | 方才 1148 | 方能 1149 | 方面 1150 | 旁人 1151 | 无 1152 | 无宁 1153 | 无法 1154 | 无论 1155 | 既 1156 | 既...又 1157 | 既往 1158 | 既是 1159 | 既然 1160 | 日复一日 1161 | 日渐 1162 | 日益 1163 | 日臻 1164 | 日见 1165 | 时候 1166 | 昂然 1167 | 明显 1168 | 明确 1169 | 是 1170 | 是不是 1171 | 是以 1172 | 是否 1173 | 是的 1174 | 显然 1175 | 显著 1176 | 普通 1177 | 普遍 1178 | 暗中 1179 | 暗地里 1180 | 暗自 1181 | 更 1182 | 更为 1183 | 更加 1184 | 更进一步 1185 | 曾 1186 | 曾经 1187 | 替 1188 | 替代 1189 | 最 1190 | 最后 1191 | 最大 1192 | 最好 1193 | 最後 1194 | 最近 1195 | 最高 1196 | 有 1197 | 有些 1198 | 有关 1199 | 有利 1200 | 有力 1201 | 有及 1202 | 有所 1203 | 有效 1204 | 有时 1205 | 有点 1206 | 有的 1207 | 有的是 1208 | 有着 1209 | 有著 1210 | 望 1211 | 朝 1212 | 朝着 1213 | 末##末 1214 | 本 1215 | 本人 1216 | 本地 1217 | 本着 1218 | 本身 1219 | 权时 1220 | 来 1221 | 来不及 1222 | 来得及 1223 | 来看 1224 | 来着 1225 | 来自 1226 | 来讲 1227 | 来说 1228 | 极 1229 | 极为 1230 | 极了 1231 | 极其 1232 | 极力 1233 | 极大 1234 | 极度 1235 | 极端 1236 | 构成 1237 
| 果然 1238 | 果真 1239 | 某 1240 | 某个 1241 | 某些 1242 | 某某 1243 | 根据 1244 | 根本 1245 | 格外 1246 | 梆 1247 | 概 1248 | 次第 1249 | 欢迎 1250 | 欤 1251 | 正值 1252 | 正在 1253 | 正如 1254 | 正巧 1255 | 正常 1256 | 正是 1257 | 此 1258 | 此中 1259 | 此后 1260 | 此地 1261 | 此处 1262 | 此外 1263 | 此时 1264 | 此次 1265 | 此间 1266 | 殆 1267 | 毋宁 1268 | 每 1269 | 每个 1270 | 每天 1271 | 每年 1272 | 每当 1273 | 每时每刻 1274 | 每每 1275 | 每逢 1276 | 比 1277 | 比及 1278 | 比如 1279 | 比如说 1280 | 比方 1281 | 比照 1282 | 比起 1283 | 比较 1284 | 毕竟 1285 | 毫不 1286 | 毫无 1287 | 毫无例外 1288 | 毫无保留地 1289 | 汝 1290 | 沙沙 1291 | 没 1292 | 没奈何 1293 | 没有 1294 | 沿 1295 | 沿着 1296 | 注意 1297 | 活 1298 | 深入 1299 | 清楚 1300 | 满 1301 | 满足 1302 | 漫说 1303 | 焉 1304 | 然 1305 | 然则 1306 | 然后 1307 | 然後 1308 | 然而 1309 | 照 1310 | 照着 1311 | 牢牢 1312 | 特别是 1313 | 特殊 1314 | 特点 1315 | 犹且 1316 | 犹自 1317 | 独 1318 | 独自 1319 | 猛然 1320 | 猛然间 1321 | 率尔 1322 | 率然 1323 | 现代 1324 | 现在 1325 | 理应 1326 | 理当 1327 | 理该 1328 | 瑟瑟 1329 | 甚且 1330 | 甚么 1331 | 甚或 1332 | 甚而 1333 | 甚至 1334 | 甚至于 1335 | 用 1336 | 用来 1337 | 甫 1338 | 甭 1339 | 由 1340 | 由于 1341 | 由是 1342 | 由此 1343 | 由此可见 1344 | 略 1345 | 略为 1346 | 略加 1347 | 略微 1348 | 白 1349 | 白白 1350 | 的 1351 | 的确 1352 | 的话 1353 | 皆可 1354 | 目前 1355 | 直到 1356 | 直接 1357 | 相似 1358 | 相信 1359 | 相反 1360 | 相同 1361 | 相对 1362 | 相对而言 1363 | 相应 1364 | 相当 1365 | 相等 1366 | 省得 1367 | 看 1368 | 看上去 1369 | 看出 1370 | 看到 1371 | 看来 1372 | 看样子 1373 | 看看 1374 | 看见 1375 | 看起来 1376 | 真是 1377 | 真正 1378 | 眨眼 1379 | 着 1380 | 着呢 1381 | 矣 1382 | 矣乎 1383 | 矣哉 1384 | 知道 1385 | 砰 1386 | 确定 1387 | 碰巧 1388 | 社会主义 1389 | 离 1390 | 种 1391 | 积极 1392 | 移动 1393 | 究竟 1394 | 穷年累月 1395 | 突出 1396 | 突然 1397 | 窃 1398 | 立 1399 | 立刻 1400 | 立即 1401 | 立地 1402 | 立时 1403 | 立马 1404 | 竟 1405 | 竟然 1406 | 竟而 1407 | 第 1408 | 第二 1409 | 等 1410 | 等到 1411 | 等等 1412 | 策略地 1413 | 简直 1414 | 简而言之 1415 | 简言之 1416 | 管 1417 | 类如 1418 | 粗 1419 | 精光 1420 | 紧接着 1421 | 累年 1422 | 累次 1423 | 纯 1424 | 纯粹 1425 | 纵 1426 | 纵令 1427 | 纵使 1428 | 纵然 1429 | 练习 1430 | 组成 1431 | 经 1432 | 经常 1433 | 经过 1434 | 结合 1435 | 结果 1436 | 给 1437 | 绝 1438 | 绝不 1439 | 绝对 1440 | 绝非 1441 | 绝顶 1442 | 继之 1443 | 继后 1444 | 继续 1445 | 继而 1446 | 维持 1447 | 综上所述 1448 | 缕缕 1449 | 罢了 1450 | 老 1451 | 老大 1452 | 老是 1453 | 老老实实 1454 | 考虑 1455 | 者 1456 | 而 1457 | 而且 1458 | 而况 1459 | 而又 1460 | 而后 1461 | 而外 1462 | 而已 1463 | 而是 1464 | 而言 1465 | 而论 1466 | 联系 1467 | 联袂 1468 | 背地里 1469 | 背靠背 1470 | 能 1471 | 能否 1472 | 能够 1473 | 腾 1474 | 自 1475 | 自个儿 1476 | 自从 1477 | 自各儿 1478 | 自后 1479 | 自家 1480 | 自己 1481 | 自打 1482 | 自身 1483 | 臭 1484 | 至 1485 | 至于 1486 | 至今 1487 | 至若 1488 | 致 1489 | 般的 1490 | 良好 1491 | 若 1492 | 若夫 1493 | 若是 1494 | 若果 1495 | 若非 1496 | 范围 1497 | 莫 1498 | 莫不 1499 | 莫不然 1500 | 莫如 1501 | 莫若 1502 | 莫非 1503 | 获得 1504 | 藉以 1505 | 虽 1506 | 虽则 1507 | 虽然 1508 | 虽说 1509 | 蛮 1510 | 行为 1511 | 行动 1512 | 表明 1513 | 表示 1514 | 被 1515 | 要 1516 | 要不 1517 | 要不是 1518 | 要不然 1519 | 要么 1520 | 要是 1521 | 要求 1522 | 见 1523 | 规定 1524 | 觉得 1525 | 譬喻 1526 | 譬如 1527 | 认为 1528 | 认真 1529 | 认识 1530 | 让 1531 | 许多 1532 | 论 1533 | 论说 1534 | 设使 1535 | 设或 1536 | 设若 1537 | 诚如 1538 | 诚然 1539 | 话说 1540 | 该 1541 | 该当 1542 | 说明 1543 | 说来 1544 | 说说 1545 | 请勿 1546 | 诸 1547 | 诸位 1548 | 诸如 1549 | 谁 1550 | 谁人 1551 | 谁料 1552 | 谁知 1553 | 谨 1554 | 豁然 1555 | 贼死 1556 | 赖以 1557 | 赶 1558 | 赶快 1559 | 赶早不赶晚 1560 | 起 1561 | 起先 1562 | 起初 1563 | 起头 1564 | 起来 1565 | 起见 1566 | 起首 1567 | 趁 1568 | 趁便 1569 | 趁势 1570 | 趁早 1571 | 趁机 1572 | 趁热 1573 | 趁着 1574 | 越是 1575 | 距 1576 | 跟 1577 | 路经 1578 | 转动 1579 | 转变 1580 | 转贴 1581 | 轰然 1582 | 较 1583 | 较为 1584 | 较之 1585 | 较比 1586 | 边 1587 | 达到 1588 | 达旦 1589 | 迄 1590 | 迅速 1591 | 过 1592 | 过于 1593 | 过去 1594 | 过来 1595 | 运用 
1596 | 近 1597 | 近几年来 1598 | 近年来 1599 | 近来 1600 | 还 1601 | 还是 1602 | 还有 1603 | 还要 1604 | 这 1605 | 这一来 1606 | 这个 1607 | 这么 1608 | 这么些 1609 | 这么样 1610 | 这么点儿 1611 | 这些 1612 | 这会儿 1613 | 这儿 1614 | 这就是说 1615 | 这时 1616 | 这样 1617 | 这次 1618 | 这点 1619 | 这种 1620 | 这般 1621 | 这边 1622 | 这里 1623 | 这麽 1624 | 进入 1625 | 进去 1626 | 进来 1627 | 进步 1628 | 进而 1629 | 进行 1630 | 连 1631 | 连同 1632 | 连声 1633 | 连日 1634 | 连日来 1635 | 连袂 1636 | 连连 1637 | 迟早 1638 | 迫于 1639 | 适应 1640 | 适当 1641 | 适用 1642 | 逐步 1643 | 逐渐 1644 | 通常 1645 | 通过 1646 | 造成 1647 | 逢 1648 | 遇到 1649 | 遭到 1650 | 遵循 1651 | 遵照 1652 | 避免 1653 | 那 1654 | 那个 1655 | 那么 1656 | 那么些 1657 | 那么样 1658 | 那些 1659 | 那会儿 1660 | 那儿 1661 | 那时 1662 | 那末 1663 | 那样 1664 | 那般 1665 | 那边 1666 | 那里 1667 | 那麽 1668 | 部分 1669 | 都 1670 | 鄙人 1671 | 采取 1672 | 里面 1673 | 重大 1674 | 重新 1675 | 重要 1676 | 鉴于 1677 | 针对 1678 | 长期以来 1679 | 长此下去 1680 | 长线 1681 | 长话短说 1682 | 问题 1683 | 间或 1684 | 防止 1685 | 阿 1686 | 附近 1687 | 陈年 1688 | 限制 1689 | 陡然 1690 | 除 1691 | 除了 1692 | 除却 1693 | 除去 1694 | 除外 1695 | 除开 1696 | 除此 1697 | 除此之外 1698 | 除此以外 1699 | 除此而外 1700 | 除非 1701 | 随 1702 | 随后 1703 | 随时 1704 | 随着 1705 | 随著 1706 | 隔夜 1707 | 隔日 1708 | 难得 1709 | 难怪 1710 | 难说 1711 | 难道 1712 | 难道说 1713 | 集中 1714 | 零 1715 | 需要 1716 | 非但 1717 | 非常 1718 | 非徒 1719 | 非得 1720 | 非特 1721 | 非独 1722 | 靠 1723 | 顶多 1724 | 顷 1725 | 顷刻 1726 | 顷刻之间 1727 | 顷刻间 1728 | 顺 1729 | 顺着 1730 | 顿时 1731 | 颇 1732 | 风雨无阻 1733 | 饱 1734 | 首先 1735 | 马上 1736 | 高低 1737 | 高兴 1738 | 默然 1739 | 默默地 1740 | 齐 1741 | ︿ 1742 | ! 1743 | # 1744 | $ 1745 | % 1746 | & 1747 | ' 1748 | ( 1749 | ) 1750 | )÷(1- 1751 | )、 1752 | * 1753 | + 1754 | +ξ 1755 | ++ 1756 | , 1757 | ,也 1758 | - 1759 | -β 1760 | -- 1761 | -[*]- 1762 | . 1763 | / 1764 | 0 1765 | 0:2 1766 | 1 1767 | 1. 1768 | 12% 1769 | 2 1770 | 2.3% 1771 | 3 1772 | 4 1773 | 5 1774 | 5:0 1775 | 6 1776 | 7 1777 | 8 1778 | 9 1779 | : 1780 | ; 1781 | < 1782 | <± 1783 | <Δ 1784 | <λ 1785 | <φ 1786 | << 1787 | = 1788 | =″ 1789 | =☆ 1790 | =( 1791 | =- 1792 | =[ 1793 | ={ 1794 | > 1795 | >λ 1796 | ? 1797 | @ 1798 | A 1799 | LI 1800 | R.L. 
1801 | ZXFITL 1802 | [ 1803 | [①①] 1804 | [①②] 1805 | [①③] 1806 | [①④] 1807 | [①⑤] 1808 | [①⑥] 1809 | [①⑦] 1810 | [①⑧] 1811 | [①⑨] 1812 | [①A] 1813 | [①B] 1814 | [①C] 1815 | [①D] 1816 | [①E] 1817 | [①] 1818 | [①a] 1819 | [①c] 1820 | [①d] 1821 | [①e] 1822 | [①f] 1823 | [①g] 1824 | [①h] 1825 | [①i] 1826 | [①o] 1827 | [② 1828 | [②①] 1829 | [②②] 1830 | [②③] 1831 | [②④ 1832 | [②⑤] 1833 | [②⑥] 1834 | [②⑦] 1835 | [②⑧] 1836 | [②⑩] 1837 | [②B] 1838 | [②G] 1839 | [②] 1840 | [②a] 1841 | [②b] 1842 | [②c] 1843 | [②d] 1844 | [②e] 1845 | [②f] 1846 | [②g] 1847 | [②h] 1848 | [②i] 1849 | [②j] 1850 | [③①] 1851 | [③⑩] 1852 | [③F] 1853 | [③] 1854 | [③a] 1855 | [③b] 1856 | [③c] 1857 | [③d] 1858 | [③e] 1859 | [③g] 1860 | [③h] 1861 | [④] 1862 | [④a] 1863 | [④b] 1864 | [④c] 1865 | [④d] 1866 | [④e] 1867 | [⑤] 1868 | [⑤]] 1869 | [⑤a] 1870 | [⑤b] 1871 | [⑤d] 1872 | [⑤e] 1873 | [⑤f] 1874 | [⑥] 1875 | [⑦] 1876 | [⑧] 1877 | [⑨] 1878 | [⑩] 1879 | [*] 1880 | [- 1881 | [] 1882 | ] 1883 | ]∧′=[ 1884 | ][ 1885 | _ 1886 | a] 1887 | b] 1888 | c] 1889 | e] 1890 | f] 1891 | ng昉 1892 | { 1893 | {- 1894 | | 1895 | } 1896 | }> 1897 | ~ 1898 | ~± 1899 | ~+ 1900 | ¥ 1901 | 玩 1902 | 1903 | -------------------------------------------------------------------------------- /site/mybzz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/__init__.py -------------------------------------------------------------------------------- /site/mybzz/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /site/mybzz/crawler/360Crawler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from site.mybzz.util import DbUtil 4 | from site.mybzz.util import BsUtil 5 | from site.mybzz.util import DateUtil 6 | 7 | conn, cur = DbUtil.getConn() 8 | 9 | 10 | def getData(name, id, score, totalDownload): 11 | commentUrl = "http://comment.mobilem.360.cn/comment/getComments?baike=%s&start=%s&count=%s" 12 | start, count = 0, 50 13 | result = BsUtil.praseJson(commentUrl % (id, start, 1)) 14 | totalComCount = result['data']['total'] 15 | 16 | print('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 17 | 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( 18 | name, '360', totalComCount, (score * 10) / 2, 19 | totalDownload, DateUtil.currentDate())) 20 | cur.execute('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 21 | 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( 22 | name, '360', totalComCount, (score * 10) / 2, 23 | totalDownload, DateUtil.currentDate())) 24 | game_id = cur.lastrowid 25 | 26 | while True: 27 | try: 28 | result = BsUtil.praseJson(commentUrl % (id, start, count)) 29 | except: 30 | print(commentUrl % (id, start, count)); break # stop paging this game if the request fails, instead of reusing the stale result 31 | if not result['data']['messages']: 32 | break 33 | for comment in result['data']['messages']: 34 | # print(comment['username'], comment['content'], comment['score'], comment['create_time']) 35 | print('INSERT INTO comment(game_id, content, comment_time, author, score) ' 36 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 37 | 
game_id, comment['content'].replace('\n', ''), comment['create_time'], 38 | comment['username'], comment['score'])) 39 | try: 40 | cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 41 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 42 | game_id, comment['content'].replace('\n', '').replace('\"','\''), comment['create_time'], 43 | comment['username'], comment['score'])) 44 | except: 45 | print(sys.exc_info()[0], ":", sys.exc_info()[1]) 46 | pass 47 | start += 50 48 | 49 | 50 | def getTop50(): 51 | result = BsUtil.praseJson("http://openbox.mobilem.360.cn/app/rank?from=game&type=download&startCount=1") 52 | # print(result) 53 | for app in result['data'][4:]: 54 | print(app['name'], app['id'], app['rating'], app['download_times']) 55 | getData(app['name'], app['id'], float(app['rating']), app['download_times']) 56 | conn.commit() 57 | 58 | if __name__ == '__main__': 59 | getTop50() 60 | DbUtil.close(conn, cur) 61 | -------------------------------------------------------------------------------- /site/mybzz/crawler/AppleCrawler.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib import request 3 | import zlib 4 | from bs4 import BeautifulSoup 5 | import re 6 | from site.mybzz.util import DbUtil 7 | from site.mybzz.util import BsUtil 8 | from site.mybzz.util import DateUtil 9 | 10 | conn, cur = DbUtil.getConn() 11 | 12 | 13 | def getComment(url, game_id): 14 | req = request.Request(url) 15 | req.add_header("User-Agent", 16 | "iTunes/11.0 (Windows; Microsoft Windows 7 Business Edition Service Pack 1 " 17 | "(Build 7601)) AppleWebKit/536.27.1") 18 | result = request.urlopen(req, timeout=30) 19 | json_result = json.loads(result.read().decode()) 20 | for comment in json_result['userReviewList']: 21 | try: 22 | print('INSERT INTO comment(game_id, content, comment_time, author, score) ' 23 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 24 | game_id, comment['body'], 25 | comment['date'].replace("T", " ").replace("Z", ""), 26 | comment['name'], comment['rating'] * 10)) 27 | cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 28 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 29 | game_id, comment['body'], 30 | comment['date'].replace("T", " ").replace("Z", ""), 31 | comment['name'], comment['rating'] * 10)) 32 | except: 33 | pass 34 | 35 | 36 | def getData(id, totalComCount, game_id): 37 | start = 0 38 | while (totalComCount > 0): 39 | try: 40 | if totalComCount > 500: 41 | url = "https://itunes.apple.com/WebObjects/MZStore.woa/wa/userReviewsRow?cc=cn&id=%s&displayable-kind=11&" \ 42 | "startIndex=%s&endIndex=%s&sort=4&appVersion=all" % ( 43 | id, start, (start + 500)) 44 | else: 45 | url = "https://itunes.apple.com/WebObjects/MZStore.woa/wa/userReviewsRow?cc=cn&id=%s&displayable-kind=11" \ 46 | "&startIndex=%s&endIndex=%s&sort=4&appVersion=all" % ( 47 | id, start, (start + totalComCount)) 48 | 49 | print(url) 50 | totalComCount = totalComCount - 500 51 | start = start + 500 52 | getComment(url, game_id) 53 | conn.commit() 54 | 55 | except: 56 | print("comment error") 57 | getComment(url, game_id) 58 | pass 59 | 60 | 61 | def getTop(all): 62 | url = "https://itunes.apple.com/cn/rss/topgrossingipadapplications/limit=50/json" 63 | 64 | result = BsUtil.praseJson(url) 65 | for app in result['feed']['entry']: 66 | if app['category']['attributes']['term'] == 'Games': 67 | try: 68 | detail = all['storePlatformData']['lockup-room']['results'][app['id']['attributes']['im:id']] 69 | 70 | print( 
71 | 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 72 | 'VALUES ("%s", "%s", "%s", %s, "%s", "%s");' % ( 73 | app['im:name']['label'], 'Apple Store', detail['userRating']['ratingCount'], 74 | (detail['userRating']['value']), 75 | 0, DateUtil.currentDate())) 76 | # cur.execute( 77 | # 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 78 | # 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % (app['im:name']['label'], 'Apple Store', 79 | # detail['userRating']['ratingCount'], 80 | # int(detail['userRating']['value']) * 10, 81 | # 0, DateUtil.currentDate())) 82 | # getData(app['id']['attributes']['im:id'], detail['userRating']['ratingCount'], cur.lastrowid) 83 | except: 84 | # print(app['id']['attributes']['im:id'], '............') 85 | # print(sys.exc_info()[0], ":", sys.exc_info()[1]) 86 | pass 87 | 88 | 89 | def getAllDetail(): 90 | url = "https://itunes.apple.com/WebObjects/MZStore.woa/wa/viewTop?id=29099&popId=38&genreId=36" 91 | req = request.Request(url) 92 | req.add_header("User-Agent", 93 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36") 94 | req.add_header("Host", "itunes.apple.com") 95 | req.add_header("Connection", "keep-alive") 96 | req.add_header("Cache-Control", "no-cache") 97 | req.add_header("X-Apple-Store-Front", "143465-19,32 ab:pNOGxia1") 98 | req.add_header("Accept-Language", "zh-cn, zh;q=0.75, en-us;q=0.50, en;q=0.25") 99 | req.add_header("X-Apple-I-MD-M", "sKfpwVaN+aYhvpzdR1eEp5E1nN7xuK5Q6eEl2fcooczbWhwrTp3PTfm5AwiMZi0hucRNdGaFRU3RX+Yx") 100 | req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 101 | req.add_header("X-Apple-I-MD-RINFO", "17106176") 102 | req.add_header("X-Apple-Tz", "28800") 103 | req.add_header("If-Modified-Since", "Fri, 03 Jun 2016 11:23:38 GMT") 104 | req.add_header("X-Apple-I-MD", "AAAABQAAABCRcn9HLXhoSaRR0WXm+1cmAAAAAg==") 105 | req.add_header("Accept-Encoding", "gzip, deflate, sdch") 106 | 107 | resp = request.urlopen(req) 108 | 109 | data = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS).decode() 110 | 111 | soup = BeautifulSoup(data, 'html.parser') 112 | data = soup.find(text=re.compile('its.serverData')).replace('its.serverData=', '') 113 | 114 | all = json.loads(data) 115 | 116 | return all 117 | 118 | 119 | if __name__ == '__main__': 120 | all = getAllDetail() 121 | getTop(all) 122 | DbUtil.close(conn, cur) 123 | -------------------------------------------------------------------------------- /site/mybzz/crawler/BaiDuCrawler.py: -------------------------------------------------------------------------------- 1 | from site.mybzz.util import DbUtil 2 | from site.mybzz.util import BsUtil 3 | from site.mybzz.util import DateUtil 4 | 5 | conn, cur = DbUtil.getConn() 6 | 7 | 8 | def getData(groupId, game_id): 9 | try: 10 | url = "http://m.baidu.com/appsrv?action=getcommentlist&native_api=1&groupid=%s&start=0&count=1" % ( 11 | groupId) 12 | result = BsUtil.praseGzipJson(url) 13 | totalComCount = result['total_count'] 14 | print("total comment count:", totalComCount) 15 | url = "http://m.baidu.com/appsrv?action=getcommentlist&native_api=1&groupid=%s&start=0&count=%s" % ( 16 | groupId, totalComCount) 17 | result = BsUtil.praseGzipJson(url) 18 | 19 | for comment in result['data']: 20 | print('INSERT INTO comment(game_id, content, comment_time, author, score) ' 21 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 22 | game_id, comment['content'].replace('\n', ''), 23 | 
DateUtil.longToStrTime(int(comment['create_time'])), 24 | comment['user_name'], comment['score'])) 25 | # cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 26 | # 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 27 | # game_id, comment['content'].replace('\n', ''), 28 | # DateUtil.longToStrTime(int(comment['create_time'])), 29 | # comment['user_name'], comment['score'])) 30 | except: 31 | pass 32 | 33 | 34 | def getTop15(): 35 | json_result = BsUtil.praseGzipJson( 36 | 'http://m.baidu.com/appsrv?action=ranklist&native_api=1&pu=ctv%401%2Ccfrom%401000561u%2Ccua%40gu2ki4uq-' 37 | 'igBNE6lI5me6NNy2I_UCvhlSdNqA%2Ccuid%400u-Yu0PYH8jVavuO_a-YagiSS8lvuvu9_a2L80ufvi6kuviJlavefYamv8_6uvtz' 38 | '_a2etxNNB%2Ccut%40rIviC_C0vC_7uLP7NJGCjxNIB%2Cosname%40baiduappsearch&name=game') 39 | 40 | for app in json_result['result']['data']: 41 | appInfo = app['itemdata'] 42 | print('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 43 | 'VALUES ("%s", "%s", "%s", "%s", "%s", "%s");' % ( 44 | appInfo['sname'], 'baidu', appInfo['commentsnum'][:-2], 10 * round(int(appInfo['score']) / 20, 1), 45 | appInfo['display_download'], DateUtil.currentDate())) 46 | # cur.execute( 47 | # 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 48 | # 'VALUES ("%s", "%s", "%s", "%s", "%s", "%s");' % ( 49 | # appInfo['sname'], 'baidu', appInfo['commentsnum'][:-2], 10 * round(int(appInfo['score']) / 20, 1), 50 | # appInfo['display_download'], DateUtil.currentDate())) 51 | # game_id = cur.lastrowid 52 | getData(appInfo['groupid'], 0) 53 | 54 | detailUrl = "http://m.baidu.com/appsrv?action=detail&native_api=1&docid=%s" % appInfo['docid'] 55 | detail = BsUtil.praseGzipJson(detailUrl) 56 | 57 | for version in detail['result']['data']['app_moreversion']: 58 | getData(version['content'][0]['groupid'], 0) 59 | print('------------------------------------------------------') 60 | 61 | 62 | if __name__ == '__main__': 63 | getTop15() 64 | conn.commit() 65 | DbUtil.close(conn, cur) 66 | -------------------------------------------------------------------------------- /site/mybzz/crawler/MZCrawler.py: -------------------------------------------------------------------------------- 1 | import html 2 | from site.mybzz.util import DbUtil 3 | from site.mybzz.util import BsUtil 4 | from site.mybzz.util import DateUtil 5 | 6 | 7 | def getData(id,package_name): 8 | 9 | total = BsUtil.praseJson('http://app.flyme.cn/apps/public/evaluate/list?app_id=%s&start=0&max=1' % id) 10 | conn,cur = DbUtil.getConn() 11 | 12 | totalComCount = total['value']['totalCount'] 13 | # fetch the total download count and rating 14 | soup = BsUtil.praseHtml('http://app.flyme.cn/games/public/detail?package_name=%s' % package_name) 15 | 16 | totalScore = soup.find('div', class_="star_bg").attrs['data-num'] 17 | totalDownload = soup.find(text="下      载:").parent.next_sibling.next_sibling.string 18 | # extract the game name 19 | for child in soup.find('div', class_="detail_top").children: 20 | if child.name == 'h3': 21 | game_name = child.string 22 | 23 | cur.execute('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 24 | 'VALUES ("%s", "%s", "%s", "%s", "%s", "%s");' %(game_name,'meizu',totalComCount, 25 | totalScore,totalDownload,DateUtil.currentDate())) 26 | game_id = cur.lastrowid 27 | # fetch every comment for this game 28 | value = BsUtil.praseJson('http://app.flyme.cn/apps/public/evaluate/list?app_id=%s&start=0&max=%s'% (id,totalComCount)) 29 | 30 | for com in 
value['value']['list']: 31 | comment = html.unescape(com['comment']).replace("\"","'") 32 | time = com['create_time'] 33 | author = html.unescape(com['user_name']).replace("\"","'") 34 | score = com['star'] 35 | 36 | try: 37 | cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 38 | 'VALUES ("%s", "%s", "%s", "%s", %d);' % (game_id,comment,time,author,score)) 39 | except: 40 | pass 41 | 42 | conn.commit() 43 | DbUtil.close(conn,cur) 44 | 45 | 46 | def getTop50(): 47 | result = BsUtil.praseJson('http://api-game.meizu.com/games/public/top/layout?start=0&max=50') 48 | for game in result['value']['blocks'][0]['data'][3:4]: 49 | print('game: %s, id: %s, package: %s' % (game['name'],game['id'],game['package_name'])) 50 | try: 51 | getData(game['id'],game['package_name']) 52 | except: 53 | pass 54 | 55 | 56 | if __name__ == '__main__': 57 | getTop50() -------------------------------------------------------------------------------- /site/mybzz/crawler/TencentCrawler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from site.mybzz.util import DbUtil 4 | from site.mybzz.util import BsUtil 5 | from site.mybzz.util import DateUtil 6 | 7 | conn, cur = DbUtil.getConn() 8 | 9 | 10 | def getData(name, downloadCount, score, packageName): 11 | contextData = '' 12 | url = "http://sj.qq.com/myapp/app/comment.htm?apkName=%s&contextData=%s" 13 | 14 | totalComCount = 0 15 | while totalComCount == 0: 16 | try: 17 | result = BsUtil.praseJson(url % (packageName, contextData)) 18 | totalComCount = result['obj']['total'] 19 | except: 20 | pass 21 | 22 | print( 23 | 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 24 | 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( 25 | name, 'qq', totalComCount, score, downloadCount, DateUtil.currentDate())) 26 | cur.execute('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 27 | 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( 28 | name, 'qq', totalComCount, score, downloadCount, DateUtil.currentDate())) 29 | game_id = cur.lastrowid 30 | while True: 31 | try: 32 | result = BsUtil.praseJson(url % (packageName, contextData)) 33 | if not result['success']: 34 | continue 35 | if result['obj']['hasNext'] != 1: 36 | break 37 | 38 | contextData = result['obj']['contextData'] 39 | 40 | for comment in result['obj']['commentDetails']: 41 | print('INSERT INTO comment(game_id, content, comment_time, author, score) ' 42 | 'VALUES ("%s", "%s", "%s", "%s", %d);' % ( 43 | game_id, comment['content'].replace('\r', '').replace(' ', ''), 44 | DateUtil.longToStrTime(int(comment['createdTime'])), 45 | comment['nickName'], int(comment['score']) * 10)) 46 | cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 47 | 'VALUES ("%s", "%s", "%s", "%s", %s);' % ( 48 | game_id, comment['content'].replace('\r', '').replace(" ", ""), 49 | DateUtil.longToStrTime(int(comment['createdTime'])), 50 | comment['nickName'], int(comment['score']) * 10)) 51 | except: 52 | conn.commit() 53 | print(result) 54 | print(sys.exc_info()[0], ":", sys.exc_info()[1]) 55 | 56 | 57 | def getTop(): 58 | url = 
"http://pngweb.3g.qq.com/getSubRankList?sortType=22&categoryId=-2&pageSize=20&startIndex=0&needCateList=0&phoneGuid=891204461307686912&phoneImei=862095025228963&qua=TMAF_652_F_2152%2F062152%26NA%2F062152%2F6522130_2152%265.1_22_2_0_0%26120_72_14%26Meizu_MX4_Meizu_meizumx4%261000047%262152%26V3&androidId=dba20155a97326c&macAdress=&imsi=460011431632413&wifiBssid=" 59 | result = BsUtil.praseQQ(url) 60 | for app in result['appList']: 61 | print(app['appName'], app['apkId'], '下载数:', app['appDownCount'], '评分:', float(app['score']) * 10, 62 | app['packageName']) 63 | getData(app['appName'], app['appDownCount'], float(app['score']) * 10, app['packageName']) 64 | pass 65 | 66 | 67 | if __name__ == '__main__': 68 | getTop() 69 | # getData('', 0, 0, packageName='com.qqgame.hlddz') 70 | DbUtil.close(conn, cur) 71 | -------------------------------------------------------------------------------- /site/mybzz/crawler/WdjCrawler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from site.mybzz.util import DbUtil 4 | from site.mybzz.util import BsUtil 5 | from site.mybzz.util import DateUtil 6 | 7 | 8 | def getData(name, id, commentCount, totalDownload, packageName): 9 | 10 | pass 11 | 12 | 13 | def getTop50(): 14 | result = BsUtil.praseJson( 15 | "http://apps.wandoujia.com/api/v1/apps?type=weeklytopgame&max=50&start=0&opt_fields=likesCount,title,packageName,installedCountStr,id,commentsCount") 16 | for app in result: 17 | print('游戏名:%s,id:%s,评论数:%s,下载量:%s,包名:%s' % ( 18 | app['title'], app['id'], app['commentsCount'], app['installedCountStr'], app['packageName'])) 19 | # print(app) 20 | if '万' in app['installedCountStr']: 21 | totalDownload = int(float(app['installedCountStr'][:-2]) * 10000) 22 | else: 23 | totalDownload = int(float(app['installedCountStr'][:-1])) 24 | 25 | getData(app['title'], app['id'], app['commentsCount'], totalDownload, app['packageName']) 26 | 27 | 28 | if __name__ == '__main__': 29 | getTop50() 30 | -------------------------------------------------------------------------------- /site/mybzz/crawler/XiaoMiCrawler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from site.mybzz.util import DbUtil 4 | from site.mybzz.util import BsUtil 5 | from site.mybzz.util import DateUtil 6 | 7 | 8 | def getTop30(): 9 | page = 0 10 | result = BsUtil.praseJson( 11 | 'http://market.xiaomi.com/apm/toplist/15?clientId=2bb48bb54747e03a6ab667ab7b51050a&co=CN&la=zh&os=1461822601&page=%d&sdk=22&stamp=0' % page) 12 | # print(result) 13 | for game in result['listApp']: 14 | print('游戏名:%s,id:%s,总评分:%s' % (game['displayName'], game['id'], game['ratingScore'])) 15 | try: 16 | getData(game['id'], game['displayName'], game['ratingScore']) 17 | except: 18 | print(sys.exc_info()[0], ":", sys.exc_info()[1]) 19 | pass 20 | 21 | 22 | def getData(id, name, totalScore): 23 | page = 0 24 | hasMore = True 25 | 26 | # 插入游戏 27 | conn, cur = DbUtil.getConn() 28 | 29 | result = BsUtil.praseJson('http://market.xiaomi.com/apm/comment/list/%s?' 
30 | 'clientId=2bb48bb54747e03a6ab667ab7b51050a&co=CN' 31 | '&la=zh&os=1461822601&page=%s&sdk=22' % (id, page)) 32 | totalComCount = result['pointCount'] 33 | 34 | print('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 35 | 'VALUES ("%s", "%s", "%s", "%s", "%s", "%s");' % ( 36 | name, 'xiaomi', totalComCount, totalScore * 10, 0, DateUtil.currentDate())) 37 | cur.execute('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 38 | 'VALUES ("%s", "%s", "%s", "%s", "%s", "%s");' % ( 39 | name, 'xiaomi', totalComCount, totalScore * 10, 0, DateUtil.currentDate())) 40 | game_id = cur.lastrowid 41 | # game_id = 0 42 | while hasMore: 43 | result = BsUtil.praseJson('http://market.xiaomi.com/apm/comment/list/%s?' 44 | 'clientId=2bb48bb54747e03a6ab667ab7b51050a&co=CN' 45 | '&la=zh&os=1461822601&page=%s&sdk=22' % (id, page)) 46 | # print(result) 47 | for comment in result['comments']: 48 | content = comment['commentValue'].replace("\"", "'").replace(" ", "") 49 | score = comment['pointValue'] 50 | time = comment['updateTime'] 51 | author = comment['nickname'].replace("\"", "'") 52 | # insert the comment 53 | try: 54 | print('INSERT INTO comment(game_id, content, comment_time, author, score) ' 55 | 'VALUES ("%s", "%s", "%s", "%s", %d);' % ( 56 | game_id, content, DateUtil.longToStrTime(time / 1000), author, score)) 57 | cur.execute('INSERT INTO comment(game_id, content, comment_time, author, score) ' 58 | 'VALUES ("%s", "%s", "%s", "%s", %d);' % ( 59 | game_id, content, DateUtil.longToStrTime(time / 1000), author, score)) 60 | except: 61 | print(sys.exc_info()[0], ":", sys.exc_info()[1]) 62 | pass 63 | page += 1 64 | hasMore = result['hasMore'] 65 | 66 | conn.commit() 67 | DbUtil.close(conn, cur) 68 | 69 | 70 | if __name__ == '__main__': 71 | getTop30() 72 | -------------------------------------------------------------------------------- /site/mybzz/domain/MzComment.py: -------------------------------------------------------------------------------- 1 | class MzComment(object): 2 | pass 3 | -------------------------------------------------------------------------------- /site/mybzz/domain/__pycache__/MzComment.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/domain/__pycache__/MzComment.cpython-34.pyc -------------------------------------------------------------------------------- /site/mybzz/keyword/CreateWordCloud.py: -------------------------------------------------------------------------------- 1 | import random 2 | from operator import itemgetter 3 | import jieba 4 | import jieba.analyse 5 | from pytagcloud import make_tags, create_tag_image 6 | from pytagcloud.colors import COLOR_SCHEMES 7 | from site.mybzz.util import DbUtil 8 | 9 | stop = [] 10 | 11 | def plot(game_name, game_id): 12 | dict = {} 13 | comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id) 14 | for comment in comments: 15 | 16 | result = jieba.analyse.extract_tags(comment[2], topK=3) 17 | 18 | for word in result: 19 | if len(word) < 2: 20 | continue 21 | elif word in stop: 22 | continue 23 | 24 | if word not in dict: 25 | dict[word] = 1 26 | else: 27 | dict[word] += 1 28 | 29 | print(dict) 30 | 31 | swd = sorted(dict.items(), key=itemgetter(1), reverse=True) 32 | swd = swd[1:50] 33 | tags = make_tags(swd, 34 | minsize=30, 35 | maxsize=120, 36 | 
colors=random.choice(list(COLOR_SCHEMES.values()))) 37 | 38 | create_tag_image(tags, 39 | 'c:/%s.png' % game_name, 40 | background=(0, 0, 0, 255), 41 | size=(900, 600), 42 | fontname='SimHei') 43 | 44 | print('saved the word cloud image to disk') 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | f = open("../StopWords.txt", encoding="utf-8") 50 | jieba.load_userdict("c:/dict.txt") 51 | 52 | while True: 53 | line = f.readline().replace("\n", '') 54 | 55 | if not line: 56 | break 57 | stop.append(line) 58 | 59 | games = DbUtil.getAllResult("select game_id,games.game_name from `comment` join games on game_id = games.id GROUP BY game_id ORDER BY count(game_id) desc limit 50") 60 | 61 | l =[] 62 | for game in games: 63 | if game[1] not in l: 64 | plot(game[1], game[0]) 65 | l.append(game[1]) 66 | -------------------------------------------------------------------------------- /site/mybzz/keyword/InsertKeyWord.py: -------------------------------------------------------------------------------- 1 | import random 2 | from operator import itemgetter 3 | import jieba 4 | import jieba.analyse 5 | from pytagcloud import make_tags, create_tag_image 6 | from pytagcloud.colors import COLOR_SCHEMES 7 | 8 | from site.mybzz.util import DbUtil 9 | 10 | stop = [] 11 | conn, cur = DbUtil.getConn() 12 | 13 | def plot(game_name, game_id): 14 | dict = {} 15 | comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id) 16 | for comment in comments: 17 | 18 | result = jieba.analyse.extract_tags(comment[2], topK=3) 19 | 20 | for word in result: 21 | if len(word) < 2: 22 | continue 23 | elif word in stop: 24 | continue 25 | 26 | if word not in dict: 27 | dict[word] = 1 28 | else: 29 | dict[word] += 1 30 | 31 | swd = sorted(dict.items(), key=itemgetter(1), reverse=True) 32 | swd = swd[1:50] 33 | 34 | tags = make_tags(swd, 35 | minsize=30, 36 | maxsize=100, 37 | colors=random.choice(list(COLOR_SCHEMES.values()))) 38 | 39 | 40 | create_tag_image(tags, 41 | 'C:/Users/Administrator/Desktop/%s_%s.png' % (game_name, game_id), 42 | background=(0, 0, 0, 255), 43 | size=(900, 600), 44 | fontname='SimHei') 45 | print('created file ---%s' % game_name) 46 | 47 | # dict = {} 48 | # 49 | # for (k, v) in swd: 50 | # dict[k] = v 51 | # print('INSERT INTO keyword (game_id, keyword) VALUES (%s, "%s"' % (game_id, str(dict))) 52 | # cur.execute('INSERT INTO keyword (game_id, keyword) VALUES (%s, "%s")' % (game_id, str(dict))) 53 | # conn.commit() 54 | 55 | word = DbUtil.getOneResult('select keyword from keyword limit 1') 56 | print(eval(word[0])) 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | f = open("../StopWords.txt", encoding="utf-8") 62 | jieba.load_userdict("c:/dict.txt") 63 | 64 | while True: 65 | line = f.readline().replace("\n", '') 66 | 67 | if not line: 68 | break 69 | stop.append(line) 70 | 71 | # games = DbUtil.getAllResult( 72 | # "select game_id,games.game_name from `comment` join games on game_id = games.id GROUP BY game_id ORDER BY count(game_id) desc") 73 | # 74 | # for game in games: 75 | # plot(game[1], game[0]) 76 | plot('皇室战争', 189) -------------------------------------------------------------------------------- /site/mybzz/neg_review.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/neg_review.pkl -------------------------------------------------------------------------------- /site/mybzz/pos_review.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/pos_review.pkl -------------------------------------------------------------------------------- /site/mybzz/sentiment/Feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import load_files 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.feature_extraction.text import CountVectorizer 7 | from sklearn.naive_bayes import MultinomialNB 8 | 9 | from site.mybzz.sentiment import feature_selection 10 | 11 | 12 | def text_classifly_twang(dataset_dir_name, fs_method, fs_num): 13 | print('Loading dataset, 80% for training, 20% for testing...') 14 | movie_reviews = load_files(dataset_dir_name) 15 | doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split( 16 | movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0) 17 | 18 | print('Feature selection...') 19 | print('fs method:' + fs_method, 'fs num:' + str(fs_num)) 20 | 21 | vectorizer = CountVectorizer(binary=True) 22 | word_tokenizer = vectorizer.build_tokenizer() 23 | doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train] 24 | term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num] 25 | 26 | print('Building VSM model...') 27 | 28 | term_dict = dict(zip(term_set_fs, range(len(term_set_fs)))) 29 | vectorizer.fixed_vocabulary = True 30 | vectorizer.vocabulary_ = term_dict 31 | doc_train_vec = vectorizer.fit_transform(doc_str_list_train) 32 | doc_test_vec = vectorizer.transform(doc_str_list_test) 33 | 34 | clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train) # fit the MultinomialNB classifier 35 | doc_test_predicted = clf.predict(doc_test_vec) 36 | 37 | acc = np.mean(doc_test_predicted == doc_class_list_test) 38 | print('Accuracy: ', acc) 39 | 40 | return acc 41 | 42 | 43 | if __name__ == '__main__': 44 | dataset_dir_name = "E:\MyProject\python\comment\site\mybzz\sentiment" 45 | fs_method_list = ['IG', 'MI', 'WLLR'] 46 | fs_num_list = range(25000, 35000, 1000) 47 | acc_dict = {} 48 | 49 | for fs_method in fs_method_list: 50 | acc_list = [] 51 | for fs_num in fs_num_list: 52 | acc = text_classifly_twang(dataset_dir_name, fs_method, fs_num) 53 | acc_list.append(acc) 54 | acc_dict[fs_method] = acc_list 55 | 56 | print('fs method:', acc_dict[fs_method]) 57 | 58 | 59 | for fs_method in fs_method_list: 60 | plt.plot(fs_num_list, acc_dict[fs_method], '--^', label=fs_method) 61 | plt.title('feature selection') 62 | plt.xlabel('fs num') 63 | plt.ylabel('accuracy') 64 | plt.ylim((0.82, 0.86)) 65 | 66 | plt.legend(loc='upper left', numpoints=1) 67 | plt.show() -------------------------------------------------------------------------------- /site/mybzz/sentiment/Vec.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | from gensim.models import word2vec 4 | import jieba 5 | from gensim.models.word2vec import LineSentence 6 | from site.mybzz.util import DbUtil 7 | 8 | stop = [] 9 | 10 | def train(fileName, modelName): 11 | model = word2vec.Word2Vec(LineSentence(fileName), size=200, window=5, min_count=5, workers=multiprocessing.cpu_count()) # train on the segmented corpus file instead of placeholder data 12 | model.save(modelName) 13 | return model 14 | 15 | 16 | def cut(): 17 | comments = 
DbUtil.getAllResult("select * from comment limit 300000") 18 | 19 | 20 | file = open("test1.txt", "w",encoding="utf-8") 21 | for comment in comments: 22 | list = [] 23 | result = jieba.cut(comment[2]) 24 | for word in result: 25 | if word not in stop and word != ' ': 26 | list.append(word) 27 | 28 | if list: 29 | file.write(" ".join(list)) 30 | file.write("\n") 31 | file.close() 32 | pass 33 | 34 | def getStop(): 35 | 36 | f = open("../StopWords.txt", encoding="utf-8") 37 | jieba.load_userdict("c:/dict.txt") 38 | 39 | while True: 40 | line = f.readline().replace("\n", '') 41 | 42 | if not line: 43 | break 44 | stop.append(line) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | getStop() 50 | 51 | cut() 52 | 53 | # model = train("test.txt","model") 54 | # 55 | # for w in model.most_similar(u'魅族'): 56 | # print(w[0], w[1]) 57 | -------------------------------------------------------------------------------- /site/mybzz/sentiment/feature_selection.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | def get_term_dict(doc_terms_list): 6 | term_set_dict = {} 7 | for doc_terms in doc_terms_list: 8 | for term in doc_terms: 9 | term_set_dict[term] = 1 10 | term_set_list = sorted(term_set_dict.keys()) # term set 排序后,按照索引做出字典 11 | term_set_dict = dict(zip(term_set_list, range(len(term_set_list)))) 12 | return term_set_dict 13 | 14 | 15 | def get_class_dict(doc_class_list): 16 | class_set = sorted(list(set(doc_class_list))) 17 | class_dict = dict(zip(class_set, range(len(class_set)))) 18 | return class_dict 19 | 20 | 21 | def stats_term_df(doc_terms_list, term_dict): 22 | term_df_dict = {}.fromkeys(term_dict.keys(), 0) 23 | for term in term_dict: 24 | for doc_terms in doc_terms_list: 25 | if term in doc_terms_list: 26 | term_df_dict[term] += 1 27 | return term_df_dict 28 | 29 | 30 | def stats_class_df(doc_class_list, class_dict): 31 | class_df_list = [0] * len(class_dict) 32 | for doc_class in doc_class_list: 33 | class_df_list[class_dict[doc_class]] += 1 34 | return class_df_list 35 | 36 | 37 | def stats_term_class_df(doc_terms_list, doc_class_list, term_dict, class_dict): 38 | term_class_df_mat = np.zeros((len(term_dict), len(class_dict)), np.float32) 39 | for k in range(len(doc_class_list)): 40 | class_index = class_dict[doc_class_list[k]] 41 | doc_terms = doc_terms_list[k] 42 | for term in set(doc_terms): 43 | term_index = term_dict[term] 44 | term_class_df_mat[term_index][class_index] += 1 45 | return term_class_df_mat 46 | 47 | 48 | def feature_selection_mi(class_df_list, term_set, term_class_df_mat): 49 | A = term_class_df_mat 50 | B = np.array([(sum(x) - x).tolist() for x in A]) 51 | C = np.tile(class_df_list, (A.shape[0], 1)) - A 52 | N = sum(class_df_list) 53 | class_set_size = len(class_df_list) 54 | 55 | term_score_mat = np.log(((A + 1.0) * N) / ((A + C) * (A + B + class_set_size))) 56 | term_score_max_list = [max(x) for x in term_score_mat] 57 | term_score_array = np.array(term_score_max_list) 58 | sorted_term_score_index = term_score_array.argsort()[:: -1] 59 | term_set_fs = [term_set[index] for index in sorted_term_score_index] 60 | 61 | return term_set_fs 62 | 63 | 64 | def feature_selection_ig(class_df_list, term_set, term_class_df_mat): 65 | A = term_class_df_mat 66 | B = np.array([(sum(x) - x).tolist() for x in A]) 67 | C = np.tile(class_df_list, (A.shape[0], 1)) - A 68 | N = sum(class_df_list) 69 | D = N - A - B - C 70 | term_df_array = np.sum(A, axis=1) 71 | class_set_size = len(class_df_list) 72 | 73 | p_t 
= term_df_array / N 74 | p_not_t = 1 - p_t 75 | p_c_t_mat = (A + 1) / (A + B + class_set_size) 76 | p_c_not_t_mat = (C + 1) / (C + D + class_set_size) 77 | p_c_t = np.sum(p_c_t_mat * np.log(p_c_t_mat), axis=1) 78 | p_c_not_t = np.sum(p_c_not_t_mat * np.log(p_c_not_t_mat), axis=1) 79 | 80 | term_score_array = p_t * p_c_t + p_not_t * p_c_not_t 81 | sorted_term_score_index = term_score_array.argsort()[:: -1] 82 | term_set_fs = [term_set[index] for index in sorted_term_score_index] 83 | 84 | return term_set_fs 85 | 86 | 87 | def feature_selection_wllr(class_df_list, term_set, term_class_df_mat): 88 | A = term_class_df_mat 89 | B = np.array([(sum(x) - x).tolist() for x in A]) 90 | C_Total = np.tile(class_df_list, (A.shape[0], 1)) 91 | N = sum(class_df_list) 92 | C_Total_Not = N - C_Total 93 | term_set_size = len(term_set) 94 | 95 | p_t_c = (A + 1E-6) / (C_Total + 1E-6 * term_set_size) 96 | p_t_not_c = (B + 1E-6) / (C_Total_Not + 1E-6 * term_set_size) 97 | term_score_mat = p_t_c * np.log(p_t_c / p_t_not_c) 98 | 99 | term_score_max_list = [max(x) for x in term_score_mat] 100 | term_score_array = np.array(term_score_max_list) 101 | sorted_term_score_index = term_score_array.argsort()[:: -1] 102 | term_set_fs = [term_set[index] for index in sorted_term_score_index] 103 | 104 | print(term_set_fs[:10]) 105 | return term_set_fs 106 | 107 | 108 | def feature_selection(doc_terms_list, doc_class_list, fs_method): 109 | class_dict = get_class_dict(doc_class_list) 110 | term_dict = get_term_dict(doc_terms_list) 111 | class_df_list = stats_class_df(doc_class_list, class_dict) 112 | term_class_df_mat = stats_term_class_df(doc_terms_list, doc_class_list, term_dict, class_dict) 113 | term_set = [term[0] for term in sorted(term_dict.items(), key=lambda x: x[1])] 114 | term_set_fs = [] 115 | 116 | if fs_method == 'MI': 117 | term_set_fs = feature_selection_mi(class_df_list, term_set, term_class_df_mat) 118 | elif fs_method == 'IG': 119 | term_set_fs = feature_selection_ig(class_df_list, term_set, term_class_df_mat) 120 | elif fs_method == 'WLLR': 121 | term_set_fs = feature_selection_wllr(class_df_list, term_set, term_class_df_mat) 122 | 123 | return term_set_fs -------------------------------------------------------------------------------- /site/mybzz/test/NltkUtil.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import itertools 4 | from nltk import BigramAssocMeasures, BigramCollocationFinder 5 | from nltk.probability import FreqDist, ConditionalFreqDist 6 | 7 | 8 | def create_word_scores(): 9 | posWords = pickle.load(open('pos_review.pkl', 'rb')) 10 | negWords = pickle.load(open('neg_review.pkl', 'rb')) 11 | 12 | posWords = list(itertools.chain(*posWords)) # flatten the nested lists into one flat word list 13 | negWords = list(itertools.chain(*negWords)) # same for the negative words 14 | 15 | word_fd = FreqDist() # frequency distribution over all words 16 | cond_word_fd = ConditionalFreqDist() # conditional frequencies of words in positive vs. negative texts 17 | for word in posWords: 18 | word_fd[word] += 1 19 | cond_word_fd['pos'][word] += 1 20 | for word in negWords: 21 | word_fd[word] += 1 22 | cond_word_fd['neg'][word] += 1 23 | 24 | pos_word_count = cond_word_fd['pos'].N() # total number of positive word tokens 25 | neg_word_count = cond_word_fd['neg'].N() # total number of negative word tokens 26 | total_word_count = pos_word_count + neg_word_count 27 | 28 | word_scores = {} 29 | for word, freq in word_fd.items(): 30 | pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), 31 | total_word_count) # chi-square score for the positive class; mutual information or other metrics would also work 32 | neg_score = 
BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), 33 | total_word_count) # 同理 34 | word_scores[word] = pos_score + neg_score # 一个词的信息量等于积极卡方统计量加上消极卡方统计量 35 | 36 | return word_scores # 包括了每个词和这个词的信息量 37 | 38 | 39 | def create_word_bigram_scores(): 40 | posdata = pickle.load(open('pos_review.pkl', 'rb')) 41 | negdata = pickle.load(open('neg_review.pkl', 'rb')) 42 | 43 | posWords = list(itertools.chain(*posdata)) 44 | negWords = list(itertools.chain(*negdata)) 45 | 46 | bigram_finder_pos = BigramCollocationFinder.from_words(posWords) 47 | bigram_finder_neg = BigramCollocationFinder.from_words(negWords) 48 | posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000) 49 | negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000) 50 | 51 | pos = posWords + posBigrams # 词和双词搭配 52 | neg = negWords + negBigrams 53 | 54 | word_fd = FreqDist() 55 | cond_word_fd = ConditionalFreqDist() 56 | for word in pos: 57 | word_fd[word] += 1 58 | cond_word_fd['pos'][word] += 1 59 | for word in neg: 60 | word_fd[word] += 1 61 | cond_word_fd['neg'][word] += 1 62 | 63 | pos_word_count = cond_word_fd['pos'].N() 64 | neg_word_count = cond_word_fd['neg'].N() 65 | total_word_count = pos_word_count + neg_word_count 66 | 67 | word_scores = {} 68 | for word, freq in word_fd.items(): 69 | pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) 70 | neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) 71 | word_scores[word] = pos_score + neg_score 72 | 73 | return word_scores 74 | 75 | def find_best_words(word_scores, number): 76 | 77 | best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number] #把词按信息量倒序排序。number是特征的维度,是可以不断调整直至最优的 78 | best_words = set([w for w, s in best_vals]) 79 | return best_words 80 | 81 | -------------------------------------------------------------------------------- /site/mybzz/test/Ran.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def getTime(date): 4 | return '2016-12-%s %s:%s:%s'%(date,random.randint(0,23),random.randint(0,59),random.randint(0,59)) 5 | 6 | if __name__ == '__main__': 7 | for i in range(1,3000): 8 | print('2016-12-%s %s:%s:%s'%(i,random.randint(0,23),random.randint(0,59),random.randint(0,59))) -------------------------------------------------------------------------------- /site/mybzz/test/Test.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | 3 | 4 | if __name__ == '__main__': 5 | name = '穿越火线' 6 | f = open('c:/%s_输出.txt' % name, 'r') 7 | w = open('c:/结果/%s.txt' % name, 'w') 8 | s = ['的','了','么','呢','是','嘛','个','都','也','比','还','这','于','与','才','用','就','在','对','去','后','说','之'] 9 | 10 | while True: 11 | line = f.readline() 12 | 13 | if not line: 14 | break 15 | flag = False 16 | if len(list(jieba.cut(line.split(' ')[0]))) > 1: 17 | for word in s: 18 | if line.split(' ')[0].startswith(word): 19 | print(line) 20 | flag = True 21 | break 22 | if not flag : 23 | print('能分开...' + line) 24 | w.write(line) 25 | else: 26 | print('不能分...' 
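81 | # Illustrative usage (assumes pos_review.pkl / neg_review.pkl exist in the
82 | # working directory, as the loaders above expect):
83 | #
84 | #     scores = create_word_bigram_scores()
85 | #     best = find_best_words(scores, 500)  # keep the 500 most informative features
86 | #     feats = {w: True for w in tokenized_comment if w in best}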
--------------------------------------------------------------------------------
/site/mybzz/test/Ran.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
4 | def getTime(date):
5 |     # random time of day on the given December date
6 |     return '2016-12-%s %s:%s:%s' % (date, random.randint(0, 23), random.randint(0, 59), random.randint(0, 59))
7 | 
8 | 
9 | if __name__ == '__main__':
10 |     for i in range(1, 3000):
11 |         print('2016-12-%s %s:%s:%s' % (i % 30 + 1, random.randint(0, 23), random.randint(0, 59), random.randint(0, 59)))  # keep the day inside 1..30
--------------------------------------------------------------------------------
/site/mybzz/test/Test.py:
--------------------------------------------------------------------------------
1 | import jieba
2 | 
3 | 
4 | if __name__ == '__main__':
5 |     name = '穿越火线'
6 |     f = open('c:/%s_输出.txt' % name, 'r')
7 |     w = open('c:/结果/%s.txt' % name, 'w')
8 |     # single-character function words that a real keyword should not start with
9 |     s = ['的', '了', '么', '呢', '是', '嘛', '个', '都', '也', '比', '还', '这', '于', '与', '才', '用', '就', '在', '对', '去', '后', '说', '之']
10 | 
11 |     while True:
12 |         line = f.readline()
13 | 
14 |         if not line:
15 |             break
16 |         flag = False
17 |         if len(list(jieba.cut(line.split(' ')[0]))) > 1:
18 |             for word in s:
19 |                 if line.split(' ')[0].startswith(word):
20 |                     print(line)
21 |                     flag = True
22 |                     break
23 |             if not flag:
24 |                 print('能分开...' + line)
25 |                 w.write(line)
26 |         else:
27 |             print('不能分...' + line)
28 |     f.close()
29 |     w.close()
--------------------------------------------------------------------------------
/site/mybzz/test/TestA.py:
--------------------------------------------------------------------------------
1 | dic = {'的': True, '坑钱': True, '挺': True, '就是': True, '好': True, ',': True}
2 | 
3 | for f, v in dic.items():
4 |     print(f, v)
5 | 
6 | 
7 | 
--------------------------------------------------------------------------------
/site/mybzz/test/TestBs.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | 
3 | html = """
4 | <html><head><title>The Dormouse's story</title></head>
5 | <body>
6 | <p class="title"><b>The Dormouse's story</b></p>
7 | <p class="story">Once upon a time there were three little sisters; and their names were
8 | <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
9 | <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
10 | <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
11 | and they lived at the bottom of a well.</p>
12 | <p class="story">...</p>
13 | """
14 | 
15 | soup = BeautifulSoup(html, "html.parser")
16 | # pretty-print the parse tree
17 | # print(soup.prettify())
18 | 
19 | # print(soup.a)
20 | # print(soup.a.name)
21 | # print(soup.a.attrs)
22 | #
23 | # print(soup.title.string)
24 | #
25 | # print(type(soup.title))
26 | 
27 | # print the child nodes
28 | # print(soup.html.contents)
29 | #
30 | # print(soup.head.children)
31 | # for child in soup.html.children:
32 | #     print(child)
33 | 
34 | # for child in soup.body.descendants:
35 | #     print(child)
36 | #
37 | # for string in soup.stripped_strings:
38 | #     print(repr(string))
39 | 
40 | # print(soup.p.next_sibling.next_sibling)
41 | 
42 | 
43 | # for sibling in soup.p.next_siblings:
44 | #     print(sibling)
45 | print(soup.a.next_element.next_element.next_element.next_element)
46 | 
--------------------------------------------------------------------------------
/site/mybzz/test/TestDb.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | import time
3 | import datetime
4 | 
5 | from snownlp import SnowNLP
6 | 
7 | from site.mybzz.util import DbUtil
8 | from site.mybzz.util import DateUtil
9 | 
10 | # conn = pymysql.connect(host="localhost", user="root", passwd="banban123", db="comment", port=3306, charset="utf8")
11 | #
12 | # cur = conn.cursor()
13 | #
14 | # cur.execute("INSERT INTO comment(game_name, content, comment_time, author, score)"
15 | #             " VALUES ('游戏名123', '内容123', '2016-05-19 15:56:07', 'ban', '44');")
16 | # conn.commit()
17 | # print("VALUES (%s, %s, %s, %s, %d);" % ('游戏名123', '内容123', '2016-05-19 15:56:07', 'ban', 44))
18 | # print(time.localtime(1463739856))
19 | # print(DateUtil.longToStrTime(1463739856))
20 | # statement = "select * from comment"
21 | #
22 | # data = DbUtil.getAllResult(statement)
23 | 
24 | # for d in data:
25 | #     print("游戏名:%s,内容:%s,时间:%s" % (d[1], d[2], d[3]))
26 | 
27 | conn, cur = DbUtil.getConn()
28 | 
29 | 
30 | # if __name__ == '__main__':
31 | #     comments = DbUtil.getAllResult("select * from comment where game_id = 275 limit 10000")
32 | #     file = open("c:/穿越火线_输入.txt", "w", encoding="GBK")
33 | #     for comment in comments:
34 | #         try:
35 | #             print(comment[2])
36 | #             file.write(comment[2])
37 | #         except:
38 | #             pass
39 | 
40 | word = DbUtil.getOneResult('select keyword from keyword limit 1')
41 | print(eval(word[0]))
42 | keywords = eval(word[0])  # the keyword column stores a dict literal, so eval() rebuilds it
43 | 
44 | for key in keywords:
45 |     print(key, SnowNLP(key).sentiments)  # SnowNLP sentiment score in [0, 1]; closer to 1 means more positive
46 | # if list:
47 | #     print("11")
48 | # else:
49 | #     print("222")
--------------------------------------------------------------------------------
/site/mybzz/test/TestJieBa.py:
--------------------------------------------------------------------------------
1 | import random
2 | from operator import itemgetter
3 | 
4 | import jieba
5 | import jieba.analyse
6 | from pytagcloud import make_tags, create_tag_image
7 | from pytagcloud.colors import COLOR_SCHEMES
8 | 
9 | from site.mybzz.util import DbUtil
10 | 
11 | # conn, cur = DbUtil.getConn()
12 | 
13 | stop = []
14 | 
15 | 
16 | def plot(game_name, game_id):
17 |     counts = {}
18 |     comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id)
19 |     for comment in comments:
20 | 
21 |         # print(comment[2])
22 |         result = jieba.analyse.extract_tags(comment[2], topK=3)  # top 3 keywords per comment
23 | 
24 |         for word in result:
25 |             if len(word) < 2:
26 |                 continue
27 |             elif word in stop:
28 |                 continue
29 | 
30 |             if word not in counts:
31 |                 counts[word] = 1
32 |             else:
33 |                 counts[word] += 1
34 |         # print(",".join(jieba.analyse.extract_tags(comment[2], topK=3)))
35 |         # print('-------------')
36 | 
37 |     print(counts)
38 | 
39 |     swd = sorted(counts.items(), key=itemgetter(1), reverse=True)
40 |     swd = swd[1:50]  # skip the single most frequent word, keep the next 49
41 |     tags = make_tags(swd,
42 |                      minsize=30,
43 |                      maxsize=120,
44 |                      colors=random.choice(list(COLOR_SCHEMES.values())))
45 | 
46 |     create_tag_image(tags,
47 |                      'c:/%s.png' % game_name,
48 |                      background=(0, 0, 0, 255),
49 |                      size=(900, 600),
50 |                      fontname='SimHei')
51 | 
52 |     print('saved word cloud image to disk')
53 | 
54 | 
55 | if __name__ == '__main__':
56 | 
57 |     f = open("../StopWords.txt", encoding="utf-8")
58 |     jieba.load_userdict("c:/dict.txt")
59 | 
60 |     while True:
61 |         line = f.readline().replace("\n", '')
62 | 
63 |         if not line:
64 |             break
65 |         stop.append(line)
66 |     # result = jieba.cut('希望能看见我的话我的破血头啊啊啊')
67 |     #
68 |     # for seg in result:
69 |     #     print(seg)
70 |     #
71 |     # print(",".join(jieba.analyse.extract_tags('希望能看见我的话我的破血头啊啊啊', topK=3)))
72 | 
73 |     games = DbUtil.getAllResult("select game_id,games.game_name from `comment` join games on game_id = games.id GROUP BY game_id ORDER BY count(game_id) desc limit 50")
74 | 
75 |     l = []
76 |     for game in games:
77 |         if game[1] not in l:
78 |             plot(game[1], game[0])
79 |             l.append(game[1])
80 |     # print(game[1], game[0])
81 |     # print(l)
--------------------------------------------------------------------------------
/site/mybzz/test/TestNltk.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import nltk
4 | from nltk.collocations import BigramCollocationFinder
5 | from nltk.metrics import BigramAssocMeasures
6 | from sklearn.metrics import accuracy_score
7 | from sklearn.svm import SVC, LinearSVC, NuSVC
8 | from sklearn.metrics import classification_report
9 | from sklearn.metrics import precision_recall_fscore_support
10 | from site.mybzz.test import NltkUtil
11 | import matplotlib.pyplot as plt
12 | from sklearn.externals import joblib
13 | 
14 | pos = pickle.load(open('pos_review.pkl', 'rb'))
15 | neg = pickle.load(open('neg_review.pkl', 'rb'))
16 | 
17 | 
18 | def bag_of_words(words):
19 |     return dict([(word, True) for word in words])
20 | 
21 | 
22 | def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
23 |     bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
24 |     bigrams = []
25 |     try:
26 |         bigrams = bigram_finder.nbest(score_fn, n)  # keep the top-n bigrams ranked by chi-square
27 |     except:
28 |         pass
29 |     return bag_of_words(bigrams)
30 | 
31 | 
32 | def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
33 |     bigram_finder = BigramCollocationFinder.from_words(words)
34 |     bigrams = bigram_finder.nbest(score_fn, n)
35 |     return bag_of_words(words + bigrams)  # all words plus the (informative) bigrams together as features
36 | 
37 | 
38 | def pos_features(feature_extraction_method):
39 |     posFeatures = []
40 |     for i in pos:
41 |         posWords = [feature_extraction_method(i), 'pos']  # label positive reviews "pos"
42 |         posFeatures.append(posWords)
43 |     return posFeatures
44 | 
45 | 
46 | def neg_features(feature_extraction_method):
47 |     negFeatures = []
48 |     for j in neg:
49 |         negWords = [feature_extraction_method(j), 'neg']  # label negative reviews "neg"
50 |         negFeatures.append(negWords)
51 |     return negFeatures
52 | 
53 | 
54 | def score(classifier):
55 |     classifier = nltk.SklearnClassifier(classifier)  # use the scikit-learn estimator through NLTK's wrapper
56 |     classifier.train(train)  # train the classifier
57 |     classifier = joblib.load('model.m')  # then evaluate the model saved earlier
58 |     # joblib.dump(classifier, 'model.m')
59 |     pred = classifier.classify_many(dev)  # predict labels for the dev-test set
60 | 
61 |     return precision_recall_fscore_support(tag_dev, pred)
62 | 
63 | 
64 | def score2(classifier):
65 |     classifier = nltk.SklearnClassifier(classifier)  # use the scikit-learn estimator through NLTK's wrapper
66 |     classifier.train(train)  # train the classifier
67 | 
68 |     pred = classifier.classify_many(dev)  # predict labels for the dev-test set
69 | 
70 |     # print(classification_report(tag_dev, pred))
71 |     # print('\n')
72 |     return accuracy_score(tag_dev, pred)  # compare predictions against the hand-labelled truth to get accuracy
73 | 
74 | 
75 | def best_word_features(words):
76 |     return dict([(word, True) for word in words if word in best_words])
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     dimension = range(100, 700, 50)
81 |     method_list = ['precision', 'recall', 'fscore']
82 |     index = 150
83 |     pos_dict = {}
84 |     pos_pre_list = []
85 |     pos_recall_list = []
86 |     pos_f_list = []
87 | 
88 |     neg_dict = {}
89 |     neg_pre_list = []
90 |     neg_recall_list = []
91 |     neg_f_list = []
92 | 
93 |     for d in dimension:
94 |         word_scores = NltkUtil.create_word_bigram_scores()
95 |         best_words = NltkUtil.find_best_words(word_scores, int(d))
96 | 
97 |         posFeatures = pos_features(best_word_features)
98 |         negFeatures = neg_features(best_word_features)
99 |         # posFeatures = pos_features(bag_of_words)  # use all words as features
100 |         # negFeatures = neg_features(bag_of_words)
101 | 
102 |         train = posFeatures[index:] + negFeatures[index:]
103 |         devtest = posFeatures[:index] + negFeatures[:index]
104 |         test = posFeatures[:5] + negFeatures[:5]
105 |         dev, tag_dev = zip(*devtest)
106 | 
107 |         # print('Feature number %s' % d)
108 |         print(score2(LinearSVC(C=0.1)))
109 |         precision, recall, fscore, support = score(LinearSVC(C=0.1))
110 |         pos_pre_list.append(round(precision[1], 3))
111 |         pos_recall_list.append(round(recall[1], 3))
112 |         pos_f_list.append(round(fscore[1], 3))
113 | 
114 |         neg_pre_list.append(round(precision[0], 3))
115 |         neg_recall_list.append(round(recall[0], 3))
116 |         neg_f_list.append(round(fscore[0], 3))
117 |         # print('SVC accuracy is %5.2f %%' % (score(SVC()) * 100))
118 |         # print('LinearSVC accuracy is %5.2f %%' % (score(LinearSVC()) * 100))
119 |         # print('NuSVC accuracy is %5.2f %%' % (score(NuSVC(nu=0.01)) * 100))
120 |         # print("\n")
121 | 
122 |     pos_dict['precision'] = pos_pre_list
123 |     pos_dict['recall'] = pos_recall_list
124 |     pos_dict['fscore'] = pos_f_list
125 | 
126 |     neg_dict['precision'] = neg_pre_list
127 |     neg_dict['recall'] = neg_recall_list
128 |     neg_dict['fscore'] = neg_f_list
129 | 
130 |     # plot the curves
131 |     for method in method_list:
132 |         plt.plot(dimension, pos_dict[method], '--^', label=method)
133 |     plt.title('pos svm')
134 |     plt.xlabel('feature num')
135 |     plt.ylabel('score')
136 |     plt.ylim((0.8, 0.95))
137 | 
138 |     plt.legend(loc='upper right', numpoints=1)
139 |     plt.show()
140 | 
141 |     for method in method_list:
142 |         plt.plot(dimension, neg_dict[method], '--*', label=method)
143 |     plt.title('neg svm')
144 |     plt.xlabel('feature num')
145 |     plt.ylabel('score')
146 |     plt.ylim((0.8, 0.95))
147 | 
148 |     plt.legend(loc='upper right', numpoints=1)
149 |     plt.show()
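150 | 
151 | # Reading the experiment: for each feature dimensionality in range(100, 700, 50)
152 | # the script retrains a LinearSVC and records precision/recall/F-score for both
153 | # classes, so the two plots show how the curves move as the number of selected
154 | # features grows; that is how a good value for find_best_words is picked.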
--------------------------------------------------------------------------------
/site/mybzz/test/TestPa.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib import request
3 | import html
4 | 
5 | import zlib
6 | from bs4 import BeautifulSoup
7 | import urllib
8 | 
9 | # req = request.urlopen('http://app.flyme.cn/apps/public/evaluate/list?app_id=2897832&start=0&max=10&mzos=3.0&screen_size=1080x1800')
10 | #
11 | # data = html.unescape(req.read().decode())
12 | # print(data)
13 | # value = json.loads(data)
14 | #
15 | # # value1 = json.loads(value['value'])
16 | #
17 | # print(value['value']['list'][0]['comment'])
18 | 
19 | # req = request.Request('http://appc.baidu.com/appsrv?action=ranklist&native_api=1&pu=ctv%401%2Ccfrom%401000561u%2Ccua%40gu2ki4uq-igBNE6lI5me6NNy2I_UCvhlSdNqA%2Ccuid%400u-Yu0PYH8jVavuO_a-YagiSS8lvuvu9_a2L80ufvi6kuviJlavefYamv8_6uvtz_a2etxNNB%2Ccut%40rIviC_C0vC_7uLP7NJGCjxNIB%2Cosname%40baiduappsearch&name=game')
20 | #
21 | #
22 | # result = request.urlopen(req)
23 | 
24 | # print(zlib.decompress(result.read(), 16 + zlib.MAX_WBITS).decode())
25 | #
26 | req2 = request.urlopen(
27 |     'http://appc.baidu.com/appsrv?action=ranklist&native_api=1&pu=ctv%401%2Ccfrom%401000561u%2Ccua%40gu2ki4uq-igBNE6lI5me6NNy2I_UCvhlSdNqA%2Ccuid%400u-Yu0PYH8jVavuO_a-YagiSS8lvuvu9_a2L80ufvi6kuviJlavefYamv8_6uvtz_a2etxNNB%2Ccut%40rIviC_C0vC_7uLP7NJGCjxNIB%2Cosname%40baiduappsearch&name=game')
28 | print(zlib.decompress(req2.read(), 16 + zlib.MAX_WBITS).decode())  # the body is gzip-compressed; 16 + MAX_WBITS tells zlib to expect a gzip header
29 | # soup = BeautifulSoup(req2.read().decode('UTF-8'), "html.parser")
30 | # b'\x1f\x8b\x08\x00...'  (raw gzip bytes of a sample response, truncated)
31 | 
32 | # print(b'\x1f\x8b'.decode('gbk'))
33 | # for div in soup.find_all('div', class_="detail_top"):
34 | #     print(div)
35 | # print(soup.find('div', class_="detail_top").child)
36 | # for child in soup.find('div', class_="detail_top").children:
37 | #     if (child.name == 'h3'):
38 | #         print(child.string)
39 | # game_name = soup.find('div', class_="detail_top").attrs['data-num']
40 | 
41 | 
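42 | 
43 | # A small helper distilled from the call above (illustrative; BsUtil.praseGzipJson
44 | # wraps the same gzip handling for the crawlers):
45 | def gunzip_body(raw):
46 |     """Decompress a gzip-encoded HTTP body and decode it as UTF-8 text."""
47 |     return zlib.decompress(raw, 16 + zlib.MAX_WBITS).decode()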
--------------------------------------------------------------------------------
/site/mybzz/test/TestPkl.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import tkinter
3 | import tkinter.font as tkFont
4 | import sys
5 | from random import shuffle
6 | 
7 | from site.mybzz.util import DbUtil
8 | import jieba
9 | 
10 | stop = []
11 | 
12 | 
13 | def getStop():
14 |     f = open("../StopWords.txt", encoding="utf-8")
15 |     jieba.load_userdict("c:/dict.txt")
16 | 
17 |     while True:
18 |         line = f.readline().replace("\n", '')
19 | 
20 |         if not line:
21 |             break
22 |         stop.append(line)
23 | 
24 | 
25 | def add(f):  # f is True for a "good" click, False for a "bad" one
26 |     global count
27 |     if f:
28 |         pos.append(lists[count])
29 |     else:
30 |         neg.append(lists[count])
31 |     count += 1
32 | 
33 |     if count >= len(comments):
34 |         frame.quit()
35 |         return
36 | 
37 |     text.delete(0.0, tkinter.END)
38 | 
39 |     if comments[count]:
40 |         while comments[count][5] < 30:  # scores below 3 stars are auto-labelled negative
41 |             neg.append(lists[count])
42 |             count += 1
43 |     text.insert(0.0, comments[count][2])
44 |     text.insert(tkinter.END, '\n\n')
45 |     text.insert(tkinter.END, '%s星' % (comments[count][5] / 10))  # the score column stores stars x 10
46 |     text.update()
47 | 
48 | 
49 | def read():
50 |     pos = pickle.load(open("pos_review.pkl", 'rb'))
51 |     neg = pickle.load(open("neg_review.pkl", 'rb'))
52 |     print(pos)
53 |     print(neg)
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     getStop()
58 |     pos = pickle.load(open("pos_review.pkl", 'rb'))
59 |     neg = pickle.load(open("neg_review.pkl", 'rb'))
60 |     print(len(pos))
61 |     print(len(neg))
62 | 
63 |     while [] in pos:
64 |         pos.pop(pos.index([]))
65 |     while [] in neg:
66 |         neg.pop(neg.index([]))
67 | 
68 |     comments = list(DbUtil.getAllResult("select * from comment limit 10000 offset 600000"))
69 | 
70 |     shuffle(comments)
71 |     comments = comments[:100]
72 |     lists = []
73 |     for comment in comments:
74 |         words = []
75 |         result = jieba.cut(comment[2])
76 |         for word in result:
77 |             if word not in stop and word != ' ':
78 |                 words.append(word)
79 | 
80 |         lists.append(words)
81 |     count = 0
82 | 
83 |     frame = tkinter.Tk()
84 |     ft = tkFont.Font(family='黑体', size=20, weight=tkFont.BOLD)
85 | 
86 |     text = tkinter.Text(frame, font=ft, height=10, width=30)
87 | 
88 |     g = tkinter.Button(frame, text="好", width=12, command=lambda: add(True))
89 |     b = tkinter.Button(frame, text="坏", width=12, command=lambda: add(False))
90 |     text.pack()
91 |     g.pack()
92 |     b.pack()
93 | 
94 |     text.insert(0.0, comments[0][2])
95 |     text.insert(tkinter.END, '\n\n')
96 |     text.insert(tkinter.END, '%s星' % (comments[0][5] / 10))
97 | 
98 |     tkinter.mainloop()
99 | 
100 |     # pickle.dump(pos, file=open('pos_review.pkl', 'wb'))
101 |     # pickle.dump(neg, file=open('neg_review.pkl', 'wb'))
102 | 
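103 | 
104 | # This script is a small hand-labelling tool: it shows 100 random comments in a
105 | # tkinter window, the 好/坏 buttons file each one into pos/neg (comments rated
106 | # under 3 stars are filed as negative automatically), and uncommenting the two
107 | # pickle.dump lines persists the result for the classifiers in TestNltk.py.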
--------------------------------------------------------------------------------
/site/mybzz/test/TestPred.py:
--------------------------------------------------------------------------------
1 | from random import shuffle
2 | 
3 | from sklearn.externals import joblib
4 | 
5 | from site.mybzz.test import NltkUtil
6 | from site.mybzz.util import DbUtil
7 | import jieba
8 | from site.mybzz.test import Ran
9 | 
10 | stop = []
11 | 
12 | 
13 | def getStop():
14 |     f = open("../StopWords.txt", encoding="utf-8")
15 |     jieba.load_userdict("c:/dict.txt")
16 | 
17 |     while True:
18 |         line = f.readline().replace("\n", '')
19 | 
20 |         if not line:
21 |             break
22 |         stop.append(line)
23 | 
24 | 
25 | def toDict(words):
26 |     return dict([(word, True) for word in words if word in best_words])
27 | 
28 | 
29 | def features(feature_extraction_method):
30 |     Features = []
31 |     for i in lists:
32 |         words = feature_extraction_method(i)  # build the feature dict for one tokenized comment
33 |         Features.append(words)
34 |     return Features
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     getStop()
39 | 
40 |     comments = list(DbUtil.getAllResult("select * from comment"))
41 | 
42 |     shuffle(comments)
43 |     conn, cur = DbUtil.getConn()
44 |     for i in range(0, 1065000, 1000):
45 |         print(i)
46 |         for comment in comments[i:i + 1000]:
47 |             print('UPDATE comment set comment_time = "%s" where id =%s' % (Ran.getTime(int(i / 1000 + 1) % 30 + 1), comment[0]))
48 |             cur.execute('UPDATE comment set comment_time = "%s" where id =%s' % (Ran.getTime(int(i / 1000 + 1) % 30 + 1), comment[0]))
49 |     conn.commit()
50 |     DbUtil.close(conn, cur)
51 |     # lists = []
52 |     # for comment in comments:
53 |     #     list = []
54 |     #     result = jieba.cut(comment[2])
55 |     #     for word in result:
56 |     #         if word not in stop and word != ' ':
57 |     #             list.append(word)
58 |     #
59 |     #     lists.append(list)
60 |     #
61 |     # word_scores = NltkUtil.create_word_bigram_scores()
62 |     # best_words = NltkUtil.find_best_words(word_scores, int(500))
63 |     #
64 |     # dataset = features(toDict)
65 |     #
66 |     # clf = joblib.load('model.m')
67 |     #
68 |     # tags = clf.classify_many(dataset)
69 |     # count = 0
70 |     # conn, cur = DbUtil.getConn()
71 |     # for tag in tags:
72 |     #     if (tag == 'pos'):
73 |     #         print('UPDATE comment set type = %d where id = %d;' % (1, comments[count][0]))
74 |     #         cur.execute('UPDATE comment set type = %d where id = %d;' % (1, comments[count][0]))
75 |     #     else:
76 |     #         print('UPDATE comment set type = %d where id = %d;' % (2, comments[count][0]))
77 |     #         cur.execute('UPDATE comment set type = %d where id = %d;' % (2, comments[count][0]))
78 |     #     count += 1
79 |     #
80 |     # conn.commit()
81 |     # DbUtil.close(conn, cur)
--------------------------------------------------------------------------------
/site/mybzz/test/TestQQ.py:
--------------------------------------------------------------------------------
1 | from urllib import request
2 | 
3 | import zlib
4 | 
5 | req = request.Request("http://pngweb.3g.qq.com/getSubRankList?sortType=22&categoryId=0&pageSize=20&"
6 |                       "startIndex=0&needCateList=1")
7 | 
8 | req.add_header("Referer", "http://qzs.qq.com/open/yyb/yyb_toplist/html/downtoplist.html?_ck_bid=3")
9 | req.add_header("Origin", "http://qzs.qq.com")
10 | req.add_header("Accept", "text/xml, text/html, application/xhtml+xml, image/png, text/plain, */*;q=0.8")
11 | req.add_header("User-Agent", "Mozilla/5.0 (Linux; Android 5.1; MX4 Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.117 Mobile Safari/537.36/apiLevel/22/qqdownloader/3/ft_apiLevel/1_0_0_0")
12 | req.add_header("Accept-Language", "zh-CN")
13 | req.add_header("Accept-Charset", "utf-8, iso-8859-1, utf-16, *;q=0.7")
14 | req.add_header("Accept-Encoding", "gzip")
15 | req.add_header("Connection", "keep-alive")
16 | # req.add_header("Host", "pngweb.3g.qq.com")
17 | # req.add_header("Cookie", "accesstoken=; caller=13; guid=891204461307686912; imei=862095025228963; isforeground=1; logintype=NONE; openappid=0; openid=; qaccesstoken=null; qopenid=null; sid=; skey=; skey_datetime=; uin=; via=UNKNOWN_VIA; vkey=")
18 | req.add_header("Q-UA2", "QV=2&PL=ADR&PR=TBS&PB=GE&VE=B1&VN=1.5.1.1065&CO=X5&COVN=025489&RF=PRI&PP=com.tencent.android.qqdownloader&PPVC=6522130&RL=1152*1920&MO= MX4 &DE=PHONE&OS=5.1&API=22&CHID=0&LCID=9422")
19 | req.add_header("Q-GUID", "cee926d32c56b35b7aa4310013b788cb")
20 | req.add_header("Q-Auth", "31045b957cf33acf31e40be2f3e71c5217597676a9729f1b")
21 | 
22 | result = request.urlopen(req)
23 | # print(result.read())
24 | print(zlib.decompress(result.read(), 16 + zlib.MAX_WBITS).decode())
25 | 
--------------------------------------------------------------------------------
/site/mybzz/test/TestReadFile.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path
3 | 
4 | root = 'C:\结果'
5 | fileNameList = []
6 | if __name__ == '__main__':
7 |     for filename in os.walk(root):
8 |         fileNameList = filename[2]  # the third element of each os.walk() tuple is the file list
9 | 
10 |     word_counts = {}
11 | 
12 |     for file in fileNameList:
13 |         f = open(root + '\\' + file)
14 |         while True:
15 |             line = f.readline()
16 |             if not line:
17 |                 break
18 |             key = line.split(' ')[0]
19 |             count = line.split(' ')[1]
20 |             word_counts[key] = count
21 | 
22 |     out = open(root + '\\' + 'dict.txt', 'a')
23 |     for (k, v) in word_counts.items():
24 |         print(k, v)
25 |         out.write(k + ' ' + v)
26 |     out.close()
--------------------------------------------------------------------------------
/site/mybzz/test/TestSk.py:
--------------------------------------------------------------------------------
1 | from sklearn import svm
2 | from sklearn.externals import joblib
3 | from sklearn import datasets
4 | 
5 | 
6 | iris = datasets.load_iris()
7 | digit = datasets.load_digits()
8 | 
9 | # print(digit.data)
10 | # print(digit.target)
11 | 
12 | clf = svm.SVC(C=100, gamma=0.001)
13 | 
14 | clf.fit(digit.data[:-1], digit.target[:-1])
15 | 
16 | clf.predict(digit.data[-1:])  # predict() expects a 2-D array, so slice rather than index
17 | 
18 | joblib.dump(clf, "model.pkl")
19 | 
20 | print(clf.score(digit.data[:-1], digit.target[:-1]))
21 | #
22 | # result = clf.predict([2, 2])  # predict the target of testing samples
23 | #
24 | # print(clf.score(X, y))
25 | #
26 | # print(result)  # target
27 | #
28 | # print(clf.support_vectors_)  # support vectors
29 | #
30 | # print(clf.support_)  # indices of support vectors
31 | #
32 | # print(clf.n_support_)  # number of support vectors for each class
--------------------------------------------------------------------------------
/site/mybzz/test/TestWordCloud.py:
--------------------------------------------------------------------------------
1 | import random
2 | from operator import itemgetter
3 | 
4 | from pytagcloud import make_tags, create_tag_image
5 | from pytagcloud.colors import COLOR_SCHEMES
6 | from snownlp import SnowNLP
7 | 
8 | path = 'c:/结果/dict.txt'
9 | if __name__ == '__main__':
10 | 
11 |     word_counts = {}
12 | 
13 |     f = open(path)
14 |     while True:
15 |         line = f.readline()
16 | 
17 |         if not line:
18 |             break
19 |         word_counts[line.split(' ')[0]] = int(line.split(' ')[1])
20 | 
21 | 
22 |     swd = sorted(word_counts.items(), key=itemgetter(1), reverse=True)
23 |     swd = swd[1:50]
24 | 
25 |     for word in swd:
26 |         print(word[0], SnowNLP(word[0]).sentiments)
27 |         # print(word[0])
28 |     # print(swd)
29 | 
30 |     # dict = {}
31 |     #
32 |     # for (k, v) in swd:
33 |     #     # print(k, v)
34 |     #     dict[k] = v
35 |     # print(dict)
36 |     # tags = make_tags(swd,
37 |     #                  minsize=30,
38 |     #                  maxsize=130,
39 |     #                  colors=random.choice(list(COLOR_SCHEMES.values())))
40 |     #
41 |     # create_tag_image(tags,
42 |     #                  'tag_cloud.png',
43 |     #                  background=(0, 0, 0, 255),
44 |     #                  size=(1280, 900),
45 |     #                  fontname='SimHei')
46 |     #
47 |     # print('saved file to disk')
--------------------------------------------------------------------------------
/site/mybzz/test/model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/test/model.m
--------------------------------------------------------------------------------
/site/mybzz/test/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/test/model.pkl
--------------------------------------------------------------------------------
/site/mybzz/test/neg_review.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/test/neg_review.pkl
--------------------------------------------------------------------------------
/site/mybzz/test/pos_review.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WhiteDevilBan/CommentCrawler/2bc36084da3693d7f6b75295fbbcad216c42b2b8/site/mybzz/test/pos_review.pkl
--------------------------------------------------------------------------------
/site/mybzz/util/BsUtil.py:
--------------------------------------------------------------------------------
1 | import zlib
2 | from bs4 import BeautifulSoup
3 | from urllib import request
4 | import json
5 | 
6 | 
7 | def praseHtml(url):
8 |     """Fetch a URL and parse the body as HTML."""
9 |     req = request.urlopen(url)
10 |     return BeautifulSoup(req.read().decode('UTF-8'), "html.parser")
11 | 
12 | 
13 | def praseJson(url, timeout=10):
14 |     """Fetch a URL and parse the body as JSON."""
15 |     req = request.urlopen(url, timeout=timeout)
16 | 
17 |     data = req.read().decode()
18 |     return json.loads(data)
19 | 
20 | 
21 | def praseGzipJson(url):
22 |     """Fetch a gzip-compressed body and parse it as JSON."""
23 |     req = request.urlopen(url)
24 | 
25 |     result = zlib.decompress(req.read(), 16 + zlib.MAX_WBITS).decode()
26 |     return json.loads(result)
27 | 
28 | 
29 | def praseQQ(url):
30 |     """Fetch the Tencent app-store ranking API with the headers it expects."""
31 |     req = request.Request(url)
32 |     req.add_header("Referer", "http://qzs.qq.com/open/yyb/yyb_toplist/html/downtoplist.html?_ck_bid=3")
33 |     req.add_header("Origin", "http://qzs.qq.com")
34 |     req.add_header("Accept", "text/xml, text/html, application/xhtml+xml, image/png, text/plain, */*;q=0.8")
35 |     req.add_header("User-Agent", "Mozilla/5.0 (Linux; Android 5.1; MX4 Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.117 Mobile Safari/537.36/apiLevel/22/qqdownloader/3/ft_apiLevel/1_0_0_0")
36 |     req.add_header("Accept-Language", "zh-CN")
37 |     req.add_header("Accept-Charset", "utf-8, iso-8859-1, utf-16, *;q=0.7")
38 |     req.add_header("Accept-Encoding", "gzip")
39 |     req.add_header("Connection", "keep-alive")
40 |     req.add_header("Host", "pngweb.3g.qq.com")
41 |     req.add_header("Q-UA2", "QV=2&PL=ADR&PR=TBS&PB=GE&VE=B1&VN=1.5.1.1065&CO=X5&COVN=025489&RF=PRI&PP=com.tencent.android.qqdownloader&PPVC=6522130&RL=1152*1920&MO= MX4 &DE=PHONE&OS=5.1&API=22&CHID=0&LCID=9422")
42 |     req.add_header("Q-GUID", "cee926d32c56b35b7aa4310013b788cb")
43 |     req.add_header("Q-Auth", "31045b957cf33acf31e40be2f3e71c5217597676a9729f1b")
44 | 
45 |     result = request.urlopen(req)
46 |     return json.loads(zlib.decompress(result.read(), 16 + zlib.MAX_WBITS).decode())
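47 | 
48 | # Illustrative usage (the URL is hypothetical; each crawler picks the helper
49 | # that matches its store's response encoding):
50 | #
51 | #     data = praseJson('http://example.com/comments?app_id=123&start=0')
52 | #     for item in data['list']:
53 | #         print(item)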
--------------------------------------------------------------------------------
/site/mybzz/util/DateUtil.py:
--------------------------------------------------------------------------------
1 | import time
2 | import datetime
3 | 
4 | 
5 | TIMEFORMAT = "%Y-%m-%d %H:%M:%S"
6 | 
7 | 
8 | def currentTime():
9 |     return time.strftime(TIMEFORMAT, time.localtime(time.time()))
10 | 
11 | 
12 | def currentDate():
13 |     return datetime.date.today()
14 | 
15 | 
16 | def longToStrTime(t):
17 |     return time.strftime(TIMEFORMAT, time.localtime(t))
--------------------------------------------------------------------------------
/site/mybzz/util/DbUtil.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | """
3 | Database helpers.
4 | """
5 | 
6 | 
7 | def getConn():
8 |     """
9 |     Get a database connection and cursor.
10 |     :return:
11 |     """
12 |     conn = pymysql.connect(host="localhost", user="root", passwd="banban123", db="comment", port=3306, charset="utf8")
13 |     cur = conn.cursor()
14 |     return (conn, cur)
15 | 
16 | 
17 | def getAllResult(statement):
18 |     """
19 |     Fetch all rows for a statement.
20 |     :param statement:
21 |     :return:
22 |     """
23 |     conn, cur = getConn()
24 | 
25 |     cur.execute(statement)
26 |     return cur.fetchall()
27 | 
28 | 
29 | # fetch a single row
30 | def getOneResult(statement):
31 |     """
32 |     Fetch a single row.
33 |     :param statement:
34 |     :return:
35 |     """
36 |     conn, cur = getConn()
37 | 
38 |     cur.execute(statement)
39 |     return cur.fetchone()
40 | 
41 | 
42 | # close the connection
43 | def close(conn, cur):
44 |     """
45 |     Close the cursor and connection.
46 |     :param conn:
47 |     :param cur:
48 |     :return:
49 |     """
50 |     cur.close()
51 |     conn.close()
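52 | 
53 | # Illustrative usage (mirrors how the test scripts above call this module):
54 | #
55 | #     rows = getAllResult("select * from comment limit 10")
56 | #     for row in rows:
57 | #         print(row[2])  # row[2] is the comment text
58 | #     conn, cur = getConn()
59 | #     close(conn, cur)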