├── .gitattributes
├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   ├── workspace.xml
│   └── yuqing_system.iml
├── .vscode
│   └── settings.json
├── README.md
├── clean
│   ├── 1.chinese_text_word_cloud.html
│   ├── 2.chinese_text_analysis.html
│   ├── 3.chinese_text_classifier.html
│   ├── news.py
│   ├── simhei.ttf
│   ├── stopwords.txt
│   ├── 出租车罢工.csv
│   ├── 地陷事件.csv
│   ├── 好一新大火.csv
│   ├── 相识度计算.py
│   └── 词频统计_LDA主题模型.py
├── dz_spider
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-36.pyc
│   ├── dz_spider
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── middlewares.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── common.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   ├── baidu.cpython-36.pyc
│   │       │   ├── sogou.cpython-36.pyc
│   │       │   └── toutiao.cpython-36.pyc
│   │       ├── baidu.py
│   │       ├── sogou.py
│   │       └── toutiao.py
│   ├── log
│   │   └── app.log
│   ├── run.py
│   └── scrapy.cfg
└── plan

/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=Python
2 | *.css linguist-language=Python
3 | *.html linguist-language=Python
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace state; XML markup not preserved. Recoverable entries:
line breakpoints at file://$PROJECT_DIR$/clean/相识度计算.py line 116,
file://$PROJECT_DIR$/clean/词频统计_LDA主题模型.py line 72,
file://$PROJECT_DIR$/clean/news.py line 53;
debugger watch expressions (Python code fragments):
str(e)
"MaxRetryError" in str(e)
data_df[:10]["count"].values*1000
"TimeoutError" in str(e)
data_df["index"].values
numpy.array(data_df["count"].values)
numpy.array(data_df["count"].Number)
sorted(wors.items(),key=lambda x:x[1],reverse=True)
sorted(wors.items(),key=lambda x:x[1],reverse=False))
--------------------------------------------------------------------------------
/.idea/yuqing_system.iml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/Users/yuanlang/work/python/anaconda2/envs/python3_6/bin/python"
3 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # 舆情线下爬虫设计
3 | 
4 | ### 安装 scrapy
5 | ```
6 | $ scrapy startproject dz_spider
7 | $ cd dz_spider
8 | $ scrapy genspider baidu www.baidu.com
9 | ```
10 | 
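The generated spiders themselves (`dz_spider/dz_spider/spiders/baidu.py`, `sogou.py`, `toutiao.py`) are not reproduced in this section. As a rough, hypothetical sketch of how such a spider could feed the `seed` table declared in `dz_spider/dz_spider/common.py` (columns `url`, `title`, `site_name`, `status`), with the query URL, CSS selectors and `site_name` value being placeholders rather than the project's real ones:

```python
# -*- coding: utf-8 -*-
# Hypothetical sketch only: the start URL, selectors and site_name below are
# illustrative and not taken from the project's actual baidu.py spider.
import scrapy


class BaiduNewsSpider(scrapy.Spider):
    name = "baidu"
    start_urls = ["https://www.baidu.com/s?tn=news&wd=达州地陷"]

    def parse(self, response):
        # One item per search result: the outbound link plus its title text,
        # matching the url/title/site_name/status columns of the seed table.
        for result in response.css("div.result"):
            url = result.css("h3 a::attr(href)").get()
            title = "".join(result.css("h3 a::text").getall()).strip()
            if url and title:
                yield {"url": url, "title": title, "site_name": "百度新闻", "status": 0}

        # Follow the "next page" link, if present, and parse it the same way.
        next_page = response.css("a.n::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
```

Rows stored with `status=0` would then be picked up by `clean/news.py`, which fetches each URL and extracts the article body with `newspaper.Article` before updating the seed's status.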
11 | ### 使用spiderkeeper管理scrapy项目 12 | ``` 13 | 略 14 | ``` 15 | 16 | ### 新闻正文提取 Article 模块 (clean/news.py) 17 | ### 新闻主题分类 (clean/关键字提取.py) -------------------------------------------------------------------------------- /clean/news.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | -------------------------------------- 4 | @describe 新闻内容提取 5 | @version: 1.0 6 | @project: yuqing_system 7 | @file: news.py 8 | @author: yuanlang 9 | @time: 2019-08-06 16:04 10 | --------------------------------------- 11 | """ 12 | import time 13 | from newspaper import Article 14 | import requests 15 | import pymysql 16 | import threading 17 | from queue import Queue 18 | 19 | conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8") 20 | cursor = conn.cursor() 21 | q=Queue() 22 | 23 | 24 | def download(url): 25 | 26 | try: 27 | print(f"fetch url --------> {url}") 28 | news = Article(url, language='zh') 29 | reponse = requests.get(url, verify=False,timeout=3) 30 | if reponse.status_code==404 or reponse.status_code==503: 31 | sql = "update seed set status=-1 where url='" + url + "'" 32 | print(sql) 33 | cursor.execute(sql) 34 | conn.commit() 35 | return 36 | news.set_html(reponse.content) 37 | news.parse() # 再解析 38 | text = news.text 39 | if text == "": 40 | sql = "update seed set status=-2 where url='" + url + "'" 41 | print(sql) 42 | cursor.execute(sql) 43 | conn.commit() 44 | return 45 | sql="insert into context(url,content) values('"+url+"','"+text+"')" 46 | print(sql) 47 | cursor.execute(sql) 48 | sql = "update seed set status=1 where url='" + url+"'" 49 | print(sql) 50 | cursor.execute(sql) 51 | conn.commit() 52 | except Exception as e: 53 | print("exception"+str(repr(e))) 54 | if "TimeoutError" in str(e) or "HTTPSConnectionPool" in str(e) or "Exceeded 30 redirects" in str(e) \ 55 | or "Max retries" in str(e) or "HTTPConnectionPool" in str(e) or "Data too long" in str(e): 56 | sql = "update seed set status=-1 where url='" + url + "'" 57 | print(sql) 58 | cursor.execute(sql) 59 | conn.commit() 60 | 61 | 62 | def spider(): 63 | 64 | 65 | while True: 66 | cursor.execute("select url from seed where status = 0 limit 1") 67 | items = cursor.fetchall() 68 | for item in items: 69 | q.put(item[0]) 70 | # result=[] 71 | # for i in range(20): 72 | # url = q.get() 73 | # t=threading.Thread(target=download,args=(url,)) 74 | # t.start() 75 | # time.sleep(1) 76 | # result.append(t) 77 | # for t in result: 78 | # t.join() 79 | url = q.get() 80 | download(url=url) 81 | # time.sleep(1) 82 | 83 | if __name__ == "__main__": 84 | spider() -------------------------------------------------------------------------------- /clean/simhei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/clean/simhei.ttf -------------------------------------------------------------------------------- /clean/stopwords.txt: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | $ 5 | % 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | -- 15 | . 16 | .. 17 | ... 18 | ...... 19 | ................... 20 | ./ 21 | .一 22 | 记者 23 | 数 24 | 年 25 | 月 26 | 日 27 | 时 28 | 分 29 | 秒 30 | / 31 | // 32 | 0 33 | 1 34 | 2 35 | 3 36 | 4 37 | 5 38 | 6 39 | 7 40 | 8 41 | 9 42 | : 43 | :// 44 | :: 45 | ; 46 | < 47 | = 48 | > 49 | >> 50 | ? 
51 | @ 52 | A 53 | Lex 54 | [ 55 | \ 56 | ] 57 | 【 58 | 】 59 | ^ 60 | _ 61 | ` 62 | exp 63 | sub 64 | sup 65 | | 66 | } 67 | ~ 68 | ~~~~ 69 | · 70 | × 71 | ××× 72 | Δ 73 | Ψ 74 | γ 75 | μ 76 | φ 77 | φ. 78 | В 79 | — 80 | —— 81 | ——— 82 | ‘ 83 | ’ 84 | ’‘ 85 | “ 86 | ” 87 | ”, 88 | … 89 | …… 90 | …………………………………………………③ 91 | ′∈ 92 | ′| 93 | ℃ 94 | Ⅲ 95 | ↑ 96 | → 97 | ∈[ 98 | ∪φ∈ 99 | ≈ 100 | ① 101 | ② 102 | ②c 103 | ③ 104 | ③] 105 | ④ 106 | ⑤ 107 | ⑥ 108 | ⑦ 109 | ⑧ 110 | ⑨ 111 | ⑩ 112 | ── 113 | ■ 114 | ▲ 115 |   116 | 、 117 | 。 118 | 〈 119 | 〉 120 | 《 121 | 》 122 | 》), 123 | 」 124 | 『 125 | 』 126 | 〔 127 | 〕 128 | 〕〔 129 | ㈧ 130 | 一 131 | 一. 132 | 一一 133 | 一下 134 | 一个 135 | 一些 136 | 一何 137 | 一切 138 | 一则 139 | 一则通过 140 | 一天 141 | 一定 142 | 一方面 143 | 一旦 144 | 一时 145 | 一来 146 | 一样 147 | 一次 148 | 一片 149 | 一番 150 | 一直 151 | 一致 152 | 一般 153 | 一起 154 | 一转眼 155 | 一边 156 | 一面 157 | 男子 158 | 女子 159 | 七 160 | 万一 161 | 三 162 | 三天两头 163 | 三番两次 164 | 三番五次 165 | 上 166 | 上下 167 | 上升 168 | 上去 169 | 上来 170 | 上述 171 | 上面 172 | 下 173 | 下列 174 | 下去 175 | 下来 176 | 下面 177 | 不 178 | 不一 179 | 不下 180 | 不久 181 | 不了 182 | 不亦乐乎 183 | 不仅 184 | 不仅...而且 185 | 不仅仅 186 | 不仅仅是 187 | 不会 188 | 不但 189 | 不但...而且 190 | 不光 191 | 不免 192 | 不再 193 | 不力 194 | 不单 195 | 不变 196 | 不只 197 | 不可 198 | 不可开交 199 | 不可抗拒 200 | 不同 201 | 不外 202 | 不外乎 203 | 不够 204 | 不大 205 | 不如 206 | 不妨 207 | 不定 208 | 不对 209 | 不少 210 | 不尽 211 | 不尽然 212 | 不巧 213 | 不已 214 | 不常 215 | 不得 216 | 不得不 217 | 不得了 218 | 不得已 219 | 不必 220 | 不怎么 221 | 不怕 222 | 不惟 223 | 不成 224 | 不拘 225 | 不择手段 226 | 不敢 227 | 不料 228 | 不断 229 | 不日 230 | 不时 231 | 不是 232 | 不曾 233 | 不止 234 | 不止一次 235 | 不比 236 | 不消 237 | 不满 238 | 不然 239 | 不然的话 240 | 不特 241 | 不独 242 | 不由得 243 | 不知不觉 244 | 不管 245 | 不管怎样 246 | 不经意 247 | 不胜 248 | 不能 249 | 不能不 250 | 不至于 251 | 不若 252 | 不要 253 | 不论 254 | 不起 255 | 不足 256 | 不过 257 | 不迭 258 | 不问 259 | 不限 260 | 与 261 | 与其 262 | 与其说 263 | 与否 264 | 与此同时 265 | 专门 266 | 且 267 | 且不说 268 | 且说 269 | 两者 270 | 严格 271 | 严重 272 | 个 273 | 个人 274 | 个别 275 | 中小 276 | 中间 277 | 丰富 278 | 串行 279 | 临 280 | 临到 281 | 为 282 | 为主 283 | 为了 284 | 为什么 285 | 为什麽 286 | 为何 287 | 为止 288 | 为此 289 | 为着 290 | 主张 291 | 主要 292 | 举凡 293 | 举行 294 | 乃 295 | 乃至 296 | 乃至于 297 | 么 298 | 之 299 | 之一 300 | 之前 301 | 之后 302 | 之後 303 | 之所以 304 | 之类 305 | 乌乎 306 | 乎 307 | 乒 308 | 乘 309 | 乘势 310 | 乘机 311 | 乘胜 312 | 乘虚 313 | 乘隙 314 | 九 315 | 也 316 | 也好 317 | 也就是说 318 | 也是 319 | 也罢 320 | 了 321 | 了解 322 | 争取 323 | 二 324 | 二来 325 | 二话不说 326 | 二话没说 327 | 于 328 | 于是 329 | 于是乎 330 | 云云 331 | 云尔 332 | 互 333 | 互相 334 | 五 335 | 些 336 | 交口 337 | 亦 338 | 产生 339 | 亲口 340 | 亲手 341 | 亲眼 342 | 亲自 343 | 亲身 344 | 人 345 | 人人 346 | 人们 347 | 人家 348 | 人民 349 | 什么 350 | 什么样 351 | 什麽 352 | 仅 353 | 仅仅 354 | 今 355 | 今后 356 | 今天 357 | 今年 358 | 今後 359 | 介于 360 | 仍 361 | 仍旧 362 | 仍然 363 | 从 364 | 从不 365 | 从严 366 | 从中 367 | 从事 368 | 从今以后 369 | 从优 370 | 从古到今 371 | 从古至今 372 | 从头 373 | 从宽 374 | 从小 375 | 从新 376 | 从无到有 377 | 从早到晚 378 | 从未 379 | 从来 380 | 从此 381 | 从此以后 382 | 从而 383 | 从轻 384 | 从速 385 | 从重 386 | 他 387 | 他人 388 | 他们 389 | 他是 390 | 他的 391 | 代替 392 | 以 393 | 以上 394 | 以下 395 | 以为 396 | 以便 397 | 以免 398 | 以前 399 | 以及 400 | 以后 401 | 以外 402 | 以後 403 | 以故 404 | 以期 405 | 以来 406 | 以至 407 | 以至于 408 | 以致 409 | 们 410 | 任 411 | 任何 412 | 任凭 413 | 任务 414 | 企图 415 | 伙同 416 | 会 417 | 伟大 418 | 传 419 | 传说 420 | 传闻 421 | 似乎 422 | 似的 423 | 但 424 | 但凡 425 | 但愿 426 | 但是 427 | 何 428 | 何乐而不为 429 | 何以 430 | 何况 431 | 何处 432 | 何妨 433 | 何尝 434 | 何必 435 | 何时 436 | 何止 437 | 何苦 438 | 何须 439 | 余外 440 | 作为 441 | 你 442 | 你们 443 | 你是 444 | 你的 445 | 使 446 | 使得 447 | 使用 448 | 例如 449 | 依 
450 | 依据 451 | 依照 452 | 依靠 453 | 便 454 | 便于 455 | 促进 456 | 保持 457 | 保管 458 | 保险 459 | 俺 460 | 俺们 461 | 倍加 462 | 倍感 463 | 倒不如 464 | 倒不如说 465 | 倒是 466 | 倘 467 | 倘使 468 | 倘或 469 | 倘然 470 | 倘若 471 | 借 472 | 借以 473 | 借此 474 | 假使 475 | 假如 476 | 假若 477 | 偏偏 478 | 做到 479 | 偶尔 480 | 偶而 481 | 傥然 482 | 像 483 | 儿 484 | 允许 485 | 元/吨 486 | 充其极 487 | 充其量 488 | 充分 489 | 先不先 490 | 先后 491 | 先後 492 | 先生 493 | 光 494 | 光是 495 | 全体 496 | 全力 497 | 全年 498 | 全然 499 | 全身心 500 | 全部 501 | 全都 502 | 全面 503 | 八 504 | 八成 505 | 公然 506 | 六 507 | 兮 508 | 共 509 | 共同 510 | 共总 511 | 关于 512 | 其 513 | 其一 514 | 其中 515 | 其二 516 | 其他 517 | 其余 518 | 其后 519 | 其它 520 | 其实 521 | 其次 522 | 具体 523 | 具体地说 524 | 具体来说 525 | 具体说来 526 | 具有 527 | 兼之 528 | 内 529 | 再 530 | 再其次 531 | 再则 532 | 再有 533 | 再次 534 | 再者 535 | 再者说 536 | 再说 537 | 冒 538 | 冲 539 | 决不 540 | 决定 541 | 决非 542 | 况且 543 | 准备 544 | 凑巧 545 | 凝神 546 | 几 547 | 几乎 548 | 几度 549 | 几时 550 | 几番 551 | 几经 552 | 凡 553 | 凡是 554 | 凭 555 | 凭借 556 | 出 557 | 出于 558 | 出去 559 | 出来 560 | 出现 561 | 分别 562 | 分头 563 | 分期 564 | 分期分批 565 | 切 566 | 切不可 567 | 切切 568 | 切勿 569 | 切莫 570 | 则 571 | 则甚 572 | 刚 573 | 刚好 574 | 刚巧 575 | 刚才 576 | 初 577 | 别 578 | 别人 579 | 别处 580 | 别是 581 | 别的 582 | 别管 583 | 别说 584 | 到 585 | 到了儿 586 | 到处 587 | 到头 588 | 到头来 589 | 到底 590 | 到目前为止 591 | 前后 592 | 前此 593 | 前者 594 | 前进 595 | 前面 596 | 加上 597 | 加之 598 | 加以 599 | 加入 600 | 加强 601 | 动不动 602 | 动辄 603 | 勃然 604 | 匆匆 605 | 十分 606 | 千 607 | 千万 608 | 千万千万 609 | 半 610 | 单 611 | 单单 612 | 单纯 613 | 即 614 | 即令 615 | 即使 616 | 即便 617 | 即刻 618 | 即如 619 | 即将 620 | 即或 621 | 即是说 622 | 即若 623 | 却 624 | 却不 625 | 历 626 | 原来 627 | 去 628 | 又 629 | 又及 630 | 及 631 | 及其 632 | 及时 633 | 及至 634 | 双方 635 | 反之 636 | 反之亦然 637 | 反之则 638 | 反倒 639 | 反倒是 640 | 反应 641 | 反手 642 | 反映 643 | 反而 644 | 反过来 645 | 反过来说 646 | 取得 647 | 取道 648 | 受到 649 | 变成 650 | 古来 651 | 另 652 | 另一个 653 | 另一方面 654 | 另外 655 | 另悉 656 | 另方面 657 | 另行 658 | 只 659 | 只当 660 | 只怕 661 | 只是 662 | 只有 663 | 只消 664 | 只要 665 | 只限 666 | 叫 667 | 叫做 668 | 召开 669 | 叮咚 670 | 叮当 671 | 可 672 | 可以 673 | 可好 674 | 可是 675 | 可能 676 | 可见 677 | 各 678 | 各个 679 | 各人 680 | 各位 681 | 各地 682 | 各式 683 | 各种 684 | 各级 685 | 各自 686 | 合理 687 | 同 688 | 同一 689 | 同时 690 | 同样 691 | 后 692 | 后来 693 | 后者 694 | 后面 695 | 向 696 | 向使 697 | 向着 698 | 吓 699 | 吗 700 | 否则 701 | 吧 702 | 吧哒 703 | 吱 704 | 呀 705 | 呃 706 | 呆呆地 707 | 呐 708 | 呕 709 | 呗 710 | 呜 711 | 呜呼 712 | 呢 713 | 周围 714 | 呵 715 | 呵呵 716 | 呸 717 | 呼哧 718 | 呼啦 719 | 咋 720 | 和 721 | 咚 722 | 咦 723 | 咧 724 | 咱 725 | 咱们 726 | 咳 727 | 哇 728 | 哈 729 | 哈哈 730 | 哉 731 | 哎 732 | 哎呀 733 | 哎哟 734 | 哗 735 | 哗啦 736 | 哟 737 | 哦 738 | 哩 739 | 哪 740 | 哪个 741 | 哪些 742 | 哪儿 743 | 哪天 744 | 哪年 745 | 哪怕 746 | 哪样 747 | 哪边 748 | 哪里 749 | 哼 750 | 哼唷 751 | 唉 752 | 唯有 753 | 啊 754 | 啊呀 755 | 啊哈 756 | 啊哟 757 | 啐 758 | 啥 759 | 啦 760 | 啪达 761 | 啷当 762 | 喀 763 | 喂 764 | 喏 765 | 喔唷 766 | 喽 767 | 嗡 768 | 嗡嗡 769 | 嗬 770 | 嗯 771 | 嗳 772 | 嘎 773 | 嘎嘎 774 | 嘎登 775 | 嘘 776 | 嘛 777 | 嘻 778 | 嘿 779 | 嘿嘿 780 | 四 781 | 因 782 | 因为 783 | 因了 784 | 因此 785 | 因着 786 | 因而 787 | 固 788 | 固然 789 | 在 790 | 在下 791 | 在于 792 | 地 793 | 均 794 | 坚决 795 | 坚持 796 | 基于 797 | 基本 798 | 基本上 799 | 处在 800 | 处处 801 | 处理 802 | 复杂 803 | 多 804 | 多么 805 | 多亏 806 | 多多 807 | 多多少少 808 | 多多益善 809 | 多少 810 | 多年前 811 | 多年来 812 | 多数 813 | 多次 814 | 够瞧的 815 | 大 816 | 大不了 817 | 大举 818 | 大事 819 | 大体 820 | 大体上 821 | 大凡 822 | 大力 823 | 大多 824 | 大多数 825 | 大大 826 | 大家 827 | 大张旗鼓 828 | 大批 829 | 大抵 830 | 大概 831 | 大略 832 | 大约 833 | 大致 834 | 大都 835 | 大量 836 | 大面儿上 837 | 失去 838 | 奇 839 | 奈 840 | 奋勇 841 | 她 842 | 她们 843 | 她是 844 | 她的 845 | 好 846 | 好在 847 | 好的 848 | 好象 849 | 如 850 
| 如上 851 | 如上所述 852 | 如下 853 | 如今 854 | 如何 855 | 如其 856 | 如前所述 857 | 如同 858 | 如常 859 | 如是 860 | 如期 861 | 如果 862 | 如次 863 | 如此 864 | 如此等等 865 | 如若 866 | 始而 867 | 姑且 868 | 存在 869 | 存心 870 | 孰料 871 | 孰知 872 | 宁 873 | 宁可 874 | 宁愿 875 | 宁肯 876 | 它 877 | 它们 878 | 它们的 879 | 它是 880 | 它的 881 | 安全 882 | 完全 883 | 完成 884 | 定 885 | 实现 886 | 实际 887 | 宣布 888 | 容易 889 | 密切 890 | 对 891 | 对于 892 | 对应 893 | 对待 894 | 对方 895 | 对比 896 | 将 897 | 将才 898 | 将要 899 | 将近 900 | 小 901 | 少数 902 | 尔 903 | 尔后 904 | 尔尔 905 | 尔等 906 | 尚且 907 | 尤其 908 | 就 909 | 就地 910 | 就是 911 | 就是了 912 | 就是说 913 | 就此 914 | 就算 915 | 就要 916 | 尽 917 | 尽可能 918 | 尽如人意 919 | 尽心尽力 920 | 尽心竭力 921 | 尽快 922 | 尽早 923 | 尽然 924 | 尽管 925 | 尽管如此 926 | 尽量 927 | 局外 928 | 居然 929 | 届时 930 | 属于 931 | 屡 932 | 屡屡 933 | 屡次 934 | 屡次三番 935 | 岂 936 | 岂但 937 | 岂止 938 | 岂非 939 | 川流不息 940 | 左右 941 | 巨大 942 | 巩固 943 | 差一点 944 | 差不多 945 | 己 946 | 已 947 | 已矣 948 | 已经 949 | 巴 950 | 巴巴 951 | 带 952 | 帮助 953 | 常 954 | 常常 955 | 常言说 956 | 常言说得好 957 | 常言道 958 | 平素 959 | 年复一年 960 | 并 961 | 并不 962 | 并不是 963 | 并且 964 | 并排 965 | 并无 966 | 并没 967 | 并没有 968 | 并肩 969 | 并非 970 | 广大 971 | 广泛 972 | 应当 973 | 应用 974 | 应该 975 | 庶乎 976 | 庶几 977 | 开外 978 | 开始 979 | 开展 980 | 引起 981 | 弗 982 | 弹指之间 983 | 强烈 984 | 强调 985 | 归 986 | 归根到底 987 | 归根结底 988 | 归齐 989 | 当 990 | 当下 991 | 当中 992 | 当儿 993 | 当前 994 | 当即 995 | 当口儿 996 | 当地 997 | 当场 998 | 当头 999 | 当庭 1000 | 当时 1001 | 当然 1002 | 当真 1003 | 当着 1004 | 形成 1005 | 彻夜 1006 | 彻底 1007 | 彼 1008 | 彼时 1009 | 彼此 1010 | 往 1011 | 往往 1012 | 待 1013 | 待到 1014 | 很 1015 | 很多 1016 | 很少 1017 | 後来 1018 | 後面 1019 | 得 1020 | 得了 1021 | 得出 1022 | 得到 1023 | 得天独厚 1024 | 得起 1025 | 心里 1026 | 必 1027 | 必定 1028 | 必将 1029 | 必然 1030 | 必要 1031 | 必须 1032 | 快 1033 | 快要 1034 | 忽地 1035 | 忽然 1036 | 怎 1037 | 怎么 1038 | 怎么办 1039 | 怎么样 1040 | 怎奈 1041 | 怎样 1042 | 怎麽 1043 | 怕 1044 | 急匆匆 1045 | 怪 1046 | 怪不得 1047 | 总之 1048 | 总是 1049 | 总的来看 1050 | 总的来说 1051 | 总的说来 1052 | 总结 1053 | 总而言之 1054 | 恍然 1055 | 恐怕 1056 | 恰似 1057 | 恰好 1058 | 恰如 1059 | 恰巧 1060 | 恰恰 1061 | 恰恰相反 1062 | 恰逢 1063 | 您 1064 | 您们 1065 | 您是 1066 | 惟其 1067 | 惯常 1068 | 意思 1069 | 愤然 1070 | 愿意 1071 | 慢说 1072 | 成为 1073 | 成年 1074 | 成年累月 1075 | 成心 1076 | 我 1077 | 我们 1078 | 我是 1079 | 我的 1080 | 或 1081 | 或则 1082 | 或多或少 1083 | 或是 1084 | 或曰 1085 | 或者 1086 | 或许 1087 | 战斗 1088 | 截然 1089 | 截至 1090 | 所 1091 | 所以 1092 | 所在 1093 | 所幸 1094 | 所有 1095 | 所谓 1096 | 才 1097 | 才能 1098 | 扑通 1099 | 打 1100 | 打从 1101 | 打开天窗说亮话 1102 | 扩大 1103 | 把 1104 | 抑或 1105 | 抽冷子 1106 | 拦腰 1107 | 拿 1108 | 按 1109 | 按时 1110 | 按期 1111 | 按照 1112 | 按理 1113 | 按说 1114 | 挨个 1115 | 挨家挨户 1116 | 挨次 1117 | 挨着 1118 | 挨门挨户 1119 | 挨门逐户 1120 | 换句话说 1121 | 换言之 1122 | 据 1123 | 据实 1124 | 据悉 1125 | 据我所知 1126 | 据此 1127 | 据称 1128 | 据说 1129 | 掌握 1130 | 接下来 1131 | 接着 1132 | 接著 1133 | 接连不断 1134 | 放量 1135 | 故 1136 | 故意 1137 | 故此 1138 | 故而 1139 | 敞开儿 1140 | 敢 1141 | 敢于 1142 | 敢情 1143 | 数/ 1144 | 整个 1145 | 断然 1146 | 方 1147 | 方便 1148 | 方才 1149 | 方能 1150 | 方面 1151 | 旁人 1152 | 无 1153 | 无宁 1154 | 无法 1155 | 无论 1156 | 既 1157 | 既...又 1158 | 既往 1159 | 既是 1160 | 既然 1161 | 日复一日 1162 | 日渐 1163 | 日益 1164 | 日臻 1165 | 日见 1166 | 时候 1167 | 昂然 1168 | 明显 1169 | 明确 1170 | 是 1171 | 是不是 1172 | 是以 1173 | 是否 1174 | 是的 1175 | 显然 1176 | 显著 1177 | 普通 1178 | 普遍 1179 | 暗中 1180 | 暗地里 1181 | 暗自 1182 | 更 1183 | 更为 1184 | 更加 1185 | 更进一步 1186 | 曾 1187 | 曾经 1188 | 替 1189 | 替代 1190 | 最 1191 | 最后 1192 | 最大 1193 | 最好 1194 | 最後 1195 | 最近 1196 | 最高 1197 | 有 1198 | 有些 1199 | 有关 1200 | 有利 1201 | 有力 1202 | 有及 1203 | 有所 1204 | 有效 1205 | 有时 1206 | 有点 1207 | 有的 1208 | 有的是 1209 | 有着 1210 | 有著 1211 | 望 1212 | 朝 1213 | 朝着 1214 | 末##末 1215 | 本 1216 | 本人 
1217 | 本地 1218 | 本着 1219 | 本身 1220 | 权时 1221 | 来 1222 | 来不及 1223 | 来得及 1224 | 来看 1225 | 来着 1226 | 来自 1227 | 来讲 1228 | 来说 1229 | 极 1230 | 极为 1231 | 极了 1232 | 极其 1233 | 极力 1234 | 极大 1235 | 极度 1236 | 极端 1237 | 构成 1238 | 果然 1239 | 果真 1240 | 某 1241 | 某个 1242 | 某些 1243 | 某某 1244 | 根据 1245 | 根本 1246 | 格外 1247 | 梆 1248 | 概 1249 | 次第 1250 | 欢迎 1251 | 欤 1252 | 正值 1253 | 正在 1254 | 正如 1255 | 正巧 1256 | 正常 1257 | 正是 1258 | 此 1259 | 此中 1260 | 此后 1261 | 此地 1262 | 此处 1263 | 此外 1264 | 此时 1265 | 此次 1266 | 此间 1267 | 殆 1268 | 毋宁 1269 | 每 1270 | 每个 1271 | 每天 1272 | 每年 1273 | 每当 1274 | 每时每刻 1275 | 每每 1276 | 每逢 1277 | 比 1278 | 比及 1279 | 比如 1280 | 比如说 1281 | 比方 1282 | 比照 1283 | 比起 1284 | 比较 1285 | 毕竟 1286 | 毫不 1287 | 毫无 1288 | 毫无例外 1289 | 毫无保留地 1290 | 汝 1291 | 沙沙 1292 | 没 1293 | 没奈何 1294 | 没有 1295 | 沿 1296 | 沿着 1297 | 注意 1298 | 活 1299 | 深入 1300 | 清楚 1301 | 满 1302 | 满足 1303 | 漫说 1304 | 焉 1305 | 然 1306 | 然则 1307 | 然后 1308 | 然後 1309 | 然而 1310 | 照 1311 | 照着 1312 | 牢牢 1313 | 特别是 1314 | 特殊 1315 | 特点 1316 | 犹且 1317 | 犹自 1318 | 独 1319 | 独自 1320 | 猛然 1321 | 猛然间 1322 | 率尔 1323 | 率然 1324 | 现代 1325 | 现在 1326 | 理应 1327 | 理当 1328 | 理该 1329 | 瑟瑟 1330 | 甚且 1331 | 甚么 1332 | 甚或 1333 | 甚而 1334 | 甚至 1335 | 甚至于 1336 | 用 1337 | 用来 1338 | 甫 1339 | 甭 1340 | 由 1341 | 由于 1342 | 由是 1343 | 由此 1344 | 由此可见 1345 | 略 1346 | 略为 1347 | 略加 1348 | 略微 1349 | 白 1350 | 白白 1351 | 的 1352 | 的确 1353 | 的话 1354 | 皆可 1355 | 目前 1356 | 直到 1357 | 直接 1358 | 相似 1359 | 相信 1360 | 相反 1361 | 相同 1362 | 相对 1363 | 相对而言 1364 | 相应 1365 | 相当 1366 | 相等 1367 | 省得 1368 | 看 1369 | 看上去 1370 | 看出 1371 | 看到 1372 | 看来 1373 | 看样子 1374 | 看看 1375 | 看见 1376 | 看起来 1377 | 真是 1378 | 真正 1379 | 眨眼 1380 | 着 1381 | 着呢 1382 | 矣 1383 | 矣乎 1384 | 矣哉 1385 | 知道 1386 | 砰 1387 | 确定 1388 | 碰巧 1389 | 社会主义 1390 | 离 1391 | 种 1392 | 积极 1393 | 移动 1394 | 究竟 1395 | 穷年累月 1396 | 突出 1397 | 突然 1398 | 窃 1399 | 立 1400 | 立刻 1401 | 立即 1402 | 立地 1403 | 立时 1404 | 立马 1405 | 竟 1406 | 竟然 1407 | 竟而 1408 | 第 1409 | 第二 1410 | 等 1411 | 等到 1412 | 等等 1413 | 策略地 1414 | 简直 1415 | 简而言之 1416 | 简言之 1417 | 管 1418 | 类如 1419 | 粗 1420 | 精光 1421 | 紧接着 1422 | 累年 1423 | 累次 1424 | 纯 1425 | 纯粹 1426 | 纵 1427 | 纵令 1428 | 纵使 1429 | 纵然 1430 | 练习 1431 | 组成 1432 | 经 1433 | 经常 1434 | 经过 1435 | 结合 1436 | 结果 1437 | 给 1438 | 绝 1439 | 绝不 1440 | 绝对 1441 | 绝非 1442 | 绝顶 1443 | 继之 1444 | 继后 1445 | 继续 1446 | 继而 1447 | 维持 1448 | 综上所述 1449 | 缕缕 1450 | 罢了 1451 | 老 1452 | 老大 1453 | 老是 1454 | 老老实实 1455 | 考虑 1456 | 者 1457 | 而 1458 | 而且 1459 | 而况 1460 | 而又 1461 | 而后 1462 | 而外 1463 | 而已 1464 | 而是 1465 | 而言 1466 | 而论 1467 | 联系 1468 | 联袂 1469 | 背地里 1470 | 背靠背 1471 | 能 1472 | 能否 1473 | 能够 1474 | 腾 1475 | 自 1476 | 自个儿 1477 | 自从 1478 | 自各儿 1479 | 自后 1480 | 自家 1481 | 自己 1482 | 自打 1483 | 自身 1484 | 臭 1485 | 至 1486 | 至于 1487 | 至今 1488 | 至若 1489 | 致 1490 | 般的 1491 | 良好 1492 | 若 1493 | 若夫 1494 | 若是 1495 | 若果 1496 | 若非 1497 | 范围 1498 | 莫 1499 | 莫不 1500 | 莫不然 1501 | 莫如 1502 | 莫若 1503 | 莫非 1504 | 获得 1505 | 藉以 1506 | 虽 1507 | 虽则 1508 | 虽然 1509 | 虽说 1510 | 蛮 1511 | 行为 1512 | 行动 1513 | 表明 1514 | 表示 1515 | 被 1516 | 要 1517 | 要不 1518 | 要不是 1519 | 要不然 1520 | 要么 1521 | 要是 1522 | 要求 1523 | 见 1524 | 规定 1525 | 觉得 1526 | 譬喻 1527 | 譬如 1528 | 认为 1529 | 认真 1530 | 认识 1531 | 让 1532 | 许多 1533 | 论 1534 | 论说 1535 | 设使 1536 | 设或 1537 | 设若 1538 | 诚如 1539 | 诚然 1540 | 话说 1541 | 该 1542 | 该当 1543 | 说明 1544 | 说来 1545 | 说说 1546 | 请勿 1547 | 诸 1548 | 诸位 1549 | 诸如 1550 | 谁 1551 | 谁人 1552 | 谁料 1553 | 谁知 1554 | 谨 1555 | 豁然 1556 | 贼死 1557 | 赖以 1558 | 赶 1559 | 赶快 1560 | 赶早不赶晚 1561 | 起 1562 | 起先 1563 | 起初 1564 | 起头 1565 | 起来 1566 | 起见 1567 | 起首 1568 | 趁 1569 | 趁便 1570 | 趁势 1571 | 趁早 1572 | 趁机 1573 | 趁热 1574 | 趁着 
1575 | 越是 1576 | 距 1577 | 跟 1578 | 路经 1579 | 转动 1580 | 转变 1581 | 转贴 1582 | 轰然 1583 | 较 1584 | 较为 1585 | 较之 1586 | 较比 1587 | 边 1588 | 达到 1589 | 达旦 1590 | 迄 1591 | 迅速 1592 | 过 1593 | 过于 1594 | 过去 1595 | 过来 1596 | 运用 1597 | 近 1598 | 近几年来 1599 | 近年来 1600 | 近来 1601 | 还 1602 | 还是 1603 | 还有 1604 | 还要 1605 | 这 1606 | 这一来 1607 | 这个 1608 | 这么 1609 | 这么些 1610 | 这么样 1611 | 这么点儿 1612 | 这些 1613 | 这会儿 1614 | 这儿 1615 | 这就是说 1616 | 这时 1617 | 这样 1618 | 这次 1619 | 这点 1620 | 这种 1621 | 这般 1622 | 这边 1623 | 这里 1624 | 这麽 1625 | 进入 1626 | 进去 1627 | 进来 1628 | 进步 1629 | 进而 1630 | 进行 1631 | 连 1632 | 连同 1633 | 连声 1634 | 连日 1635 | 连日来 1636 | 连袂 1637 | 连连 1638 | 迟早 1639 | 迫于 1640 | 适应 1641 | 适当 1642 | 适用 1643 | 逐步 1644 | 逐渐 1645 | 通常 1646 | 通过 1647 | 造成 1648 | 逢 1649 | 遇到 1650 | 遭到 1651 | 遵循 1652 | 遵照 1653 | 避免 1654 | 那 1655 | 那个 1656 | 那么 1657 | 那么些 1658 | 那么样 1659 | 那些 1660 | 那会儿 1661 | 那儿 1662 | 那时 1663 | 那末 1664 | 那样 1665 | 那般 1666 | 那边 1667 | 那里 1668 | 那麽 1669 | 部分 1670 | 都 1671 | 鄙人 1672 | 采取 1673 | 里面 1674 | 重大 1675 | 重新 1676 | 重要 1677 | 鉴于 1678 | 针对 1679 | 长期以来 1680 | 长此下去 1681 | 长线 1682 | 长话短说 1683 | 问题 1684 | 间或 1685 | 防止 1686 | 阿 1687 | 附近 1688 | 陈年 1689 | 限制 1690 | 陡然 1691 | 除 1692 | 除了 1693 | 除却 1694 | 除去 1695 | 除外 1696 | 除开 1697 | 除此 1698 | 除此之外 1699 | 除此以外 1700 | 除此而外 1701 | 除非 1702 | 随 1703 | 随后 1704 | 随时 1705 | 随着 1706 | 随著 1707 | 隔夜 1708 | 隔日 1709 | 难得 1710 | 难怪 1711 | 难说 1712 | 难道 1713 | 难道说 1714 | 集中 1715 | 零 1716 | 需要 1717 | 非但 1718 | 非常 1719 | 非徒 1720 | 非得 1721 | 非特 1722 | 非独 1723 | 靠 1724 | 顶多 1725 | 顷 1726 | 顷刻 1727 | 顷刻之间 1728 | 顷刻间 1729 | 顺 1730 | 顺着 1731 | 顿时 1732 | 颇 1733 | 风雨无阻 1734 | 饱 1735 | 首先 1736 | 马上 1737 | 高低 1738 | 高兴 1739 | 默然 1740 | 默默地 1741 | 齐 1742 | ︿ 1743 | ! 1744 | # 1745 | $ 1746 | % 1747 | & 1748 | ' 1749 | ( 1750 | ) 1751 | )÷(1- 1752 | )、 1753 | * 1754 | + 1755 | +ξ 1756 | ++ 1757 | , 1758 | ,也 1759 | - 1760 | -β 1761 | -- 1762 | -[*]- 1763 | . 1764 | / 1765 | 0 1766 | 0:2 1767 | 1 1768 | 1. 1769 | 12% 1770 | 2 1771 | 2.3% 1772 | 3 1773 | 4 1774 | 5 1775 | 5:0 1776 | 6 1777 | 7 1778 | 8 1779 | 9 1780 | : 1781 | ; 1782 | < 1783 | <± 1784 | <Δ 1785 | <λ 1786 | <φ 1787 | << 1788 | = 1789 | =″ 1790 | =☆ 1791 | =( 1792 | =- 1793 | =[ 1794 | ={ 1795 | > 1796 | >λ 1797 | ? 1798 | @ 1799 | A 1800 | LI 1801 | R.L. 
1802 | ZXFITL 1803 | 1804 | [*] 1805 | [- 1806 | [] 1807 | ] 1808 | ]∧′=[ 1809 | ][ 1810 | _ 1811 | a] 1812 | b] 1813 | c] 1814 | e] 1815 | f] 1816 | ng昉 1817 | { 1818 | {- 1819 | | 1820 | } 1821 | }> 1822 | ~ 1823 | ~± 1824 | ~+ 1825 | ¥ 1826 | secondly 1827 | all 1828 | whose 1829 | under 1830 | sorry 1831 | four 1832 | we'll 1833 | somewhere 1834 | likely 1835 | even 1836 | above 1837 | ever 1838 | never 1839 | ZZ 1840 | hers 1841 | i'd 1842 | howbeit 1843 | i'm 1844 | theres 1845 | changes 1846 | anyhow 1847 | would 1848 | therefore 1849 | is 1850 | hereby 1851 | must 1852 | me 1853 | my 1854 | indicated 1855 | indicates 1856 | keep 1857 | far 1858 | after 1859 | hereupon 1860 | keeps 1861 | every 1862 | over 1863 | before 1864 | better 1865 | then 1866 | them 1867 | they 1868 | reasonably 1869 | each 1870 | went 1871 | mean 1872 | we'd 1873 | rd 1874 | re 1875 | got 1876 | forth 1877 | you're 1878 | little 1879 | whereupon 1880 | uses 1881 | already 1882 | another 1883 | took 1884 | second 1885 | seen 1886 | seem 1887 | relatively 1888 | thoroughly 1889 | latter 1890 | that 1891 | thorough 1892 | nobody 1893 | definitely 1894 | came 1895 | saying 1896 | specify 1897 | do 1898 | next 1899 | despite 1900 | unfortunately 1901 | twice 1902 | best 1903 | said 1904 | away 1905 | there's 1906 | unto 1907 | hopefully 1908 | seven 1909 | we 1910 | ltd 1911 | here 1912 | against 1913 | com 1914 | ZT 1915 | aren't 1916 | been 1917 | much 1918 | concerning 1919 | wish 1920 | say 1921 | near 1922 | unlikely 1923 | cant 1924 | in 1925 | ie 1926 | if 1927 | containing 1928 | beside 1929 | several 1930 | kept 1931 | whereby 1932 | whoever 1933 | the 1934 | yours 1935 | just 1936 | yes 1937 | yet 1938 | had 1939 | has 1940 | t's 1941 | possible 1942 | apart 1943 | right 1944 | old 1945 | somehow 1946 | for 1947 | everything 1948 | asking 1949 | who 1950 | of 1951 | theirs 1952 | plus 1953 | formerly 1954 | down 1955 | c's 1956 | accordingly 1957 | way 1958 | was 1959 | becoming 1960 | tell 1961 | sometime 1962 | no 1963 | whereas 1964 | nd 1965 | welcome 1966 | let's 1967 | certainly 1968 | a's 1969 | did 1970 | it'll 1971 | says 1972 | appear 1973 | alone 1974 | wherever 1975 | example 1976 | usually 1977 | nowhere 1978 | hither 1979 | regardless 1980 | everybody 1981 | thru 1982 | everywhere 1983 | can 1984 | following 1985 | want 1986 | didn't 1987 | may 1988 | such 1989 | whenever 1990 | maybe 1991 | ones 1992 | so 1993 | seeing 1994 | indeed 1995 | course 1996 | still 1997 | thank 1998 | he's 1999 | selves 2000 | ours 2001 | outside 2002 | non 2003 | within 2004 | thereby 2005 | not 2006 | now 2007 | nor 2008 | entirely 2009 | eg 2010 | ex 2011 | et 2012 | hadn't 2013 | furthermore 2014 | looking 2015 | seriously 2016 | shouldn't 2017 | she 2018 | quite 2019 | besides 2020 | think 2021 | first 2022 | ignored 2023 | awfully 2024 | given 2025 | anyone 2026 | indicate 2027 | gives 2028 | mostly 2029 | than 2030 | here's 2031 | were 2032 | and 2033 | appreciate 2034 | himself 2035 | saw 2036 | any 2037 | downwards 2038 | take 2039 | sure 2040 | especially 2041 | later 2042 | that's 2043 | fifth 2044 | don't 2045 | aside 2046 | only 2047 | going 2048 | get 2049 | truly 2050 | cannot 2051 | nearly 2052 | regarding 2053 | us 2054 | where 2055 | up 2056 | namely 2057 | anyways 2058 | wonder 2059 | behind 2060 | between 2061 | it 2062 | across 2063 | come 2064 | many 2065 | whereafter 2066 | according 2067 | comes 2068 | afterwards 2069 | couldn't 2070 | moreover 2071 | considering 2072 | sensible 2073 | 
hardly 2074 | wants 2075 | former 2076 | those 2077 | these 2078 | [ 2079 | somebody 2080 | different 2081 | etc 2082 | insofar 2083 | same 2084 | without 2085 | can't 2086 | very 2087 | you've 2088 | among 2089 | being 2090 | we've 2091 | seems 2092 | around 2093 | using 2094 | specified 2095 | on 2096 | ok 2097 | oh 2098 | whence 2099 | it's 2100 | or 2101 | everyone 2102 | your 2103 | her 2104 | there 2105 | amongst 2106 | trying 2107 | with 2108 | they're 2109 | wasn't 2110 | gone 2111 | certain 2112 | am 2113 | an 2114 | as 2115 | at 2116 | again 2117 | serious 2118 | hello 2119 | since 2120 | consider 2121 | causes 2122 | to 2123 | th 2124 | myself 2125 | i'll 2126 | zero 2127 | further 2128 | what 2129 | brief 2130 | seemed 2131 | c'mon 2132 | allows 2133 | followed 2134 | ask 2135 | viz 2136 | contains 2137 | two 2138 | taken 2139 | more 2140 | knows 2141 | ain't 2142 | particular 2143 | known 2144 | none 2145 | nine 2146 | needs 2147 | rather 2148 | [ 2149 | okay 2150 | tried 2151 | tries 2152 | onto 2153 | perhaps 2154 | specifying 2155 | ] 2156 | help 2157 | soon 2158 | through 2159 | its 2160 | seeming 2161 | inward 2162 | actually 2163 | might 2164 | haven't 2165 | someone 2166 | hereafter 2167 | always 2168 | isn't 2169 | beyond 2170 | really 2171 | they'll 2172 | enough 2173 | thereafter 2174 | done 2175 | together 2176 | least 2177 | too 2178 | immediate 2179 | believe 2180 | gotten 2181 | toward 2182 | self 2183 | also 2184 | towards 2185 | most 2186 | nothing 2187 | they'd 2188 | sometimes 2189 | lest 2190 | particularly 2191 | somewhat 2192 | his 2193 | goes 2194 | meanwhile 2195 | during 2196 | him 2197 | greetings 2198 | see 2199 | are 2200 | currently 2201 | please 2202 | various 2203 | probably 2204 | available 2205 | both 2206 | last 2207 | wouldn't 2208 | became 2209 | whole 2210 | liked 2211 | whatever 2212 | except 2213 | throughout 2214 | along 2215 | described 2216 | though 2217 | whom 2218 | beforehand 2219 | what's 2220 | new 2221 | else 2222 | look 2223 | while 2224 | herein 2225 | itself 2226 | wherein 2227 | used 2228 | anybody 2229 | obviously 2230 | thats 2231 | from 2232 | useful 2233 | merely 2234 | follows 2235 | often 2236 | some 2237 | ourselves 2238 | shall 2239 | per 2240 | tends 2241 | either 2242 | be 2243 | by 2244 | anything 2245 | consequently 2246 | into 2247 | appropriate 2248 | we're 2249 | elsewhere 2250 | hasn't 2251 | un 2252 | noone 2253 | associated 2254 | thanks 2255 | having 2256 | once 2257 | edu 2258 | go 2259 | sent 2260 | provides 2261 | yourselves 2262 | they've 2263 | try 2264 | this 2265 | you'd 2266 | yourself 2267 | zz 2268 | zt 2269 | respectively 2270 | let 2271 | others 2272 | until 2273 | weren't 2274 | use 2275 | few 2276 | themselves 2277 | becomes 2278 | anywhere 2279 | something 2280 | six 2281 | allow 2282 | won't 2283 | thence 2284 | willing 2285 | instead 2286 | whither 2287 | doing 2288 | how 2289 | cause 2290 | thereupon 2291 | que 2292 | via 2293 | could 2294 | hence 2295 | third 2296 | doesn't 2297 | their 2298 | exactly 2299 | regards 2300 | herself 2301 | have 2302 | need 2303 | clearly 2304 | i've 2305 | able 2306 | which 2307 | unless 2308 | where's 2309 | eight 2310 | why 2311 | you'll 2312 | normally 2313 | anyway 2314 | one 2315 | should 2316 | mainly 2317 | overall 2318 | qv 2319 | contain 2320 | looks 2321 | neither 2322 | however 2323 | otherwise 2324 | co 2325 | it'd 2326 | corresponding 2327 | thanx 2328 | novel 2329 | value 2330 | will 2331 | almost 2332 | thus 2333 | vs 2334 | when 2335 | gets 
2336 | upon 2337 | off 2338 | nevertheless 2339 | well 2340 | less 2341 | presumably 2342 | ought 2343 | who's 2344 | five 2345 | know 2346 | you 2347 | name 2348 | necessary 2349 | like 2350 | become 2351 | therein 2352 | because 2353 | happens 2354 | does 2355 | although 2356 | about 2357 | getting 2358 | own 2359 | three 2360 | inasmuch 2361 | inner 2362 | but 2363 | hi 2364 | he 2365 | whether 2366 | placed 2367 | below 2368 | our 2369 | 上去-- 2370 | inc 2371 | lately 2372 | other 2373 | latterly 2374 | out 2375 | 是什么 2376 | 什么时候 2377 | 是什么意思 2378 | 什么意思 2379 | 多少钱 2380 | 有没有 2381 | 更有趣 2382 | 更有甚者 2383 | 更有效 2384 | 更有意义 2385 | 更远的 2386 | 更重要的是 2387 | 正确 2388 | 错误 2389 | 第二把 2390 | 第二波 2391 | 第二大节 2392 | 第二单元 2393 | 第二关 2394 | 第二行 2395 | 第二集 2396 | 第二讲 2397 | 第二款 2398 | 第二类 2399 | 第二盘 2400 | 第二任 2401 | 第二声 2402 | 第二十 2403 | 第二首 2404 | 第二项 2405 | 第三遍 2406 | 第三册 2407 | 第三层 2408 | 第三产业 2409 | 第三大 2410 | 第三单元 2411 | 第三行 2412 | 第三回 2413 | 第三集 2414 | 第三件 2415 | 第三句 2416 | 第三卷 2417 | 第三课 2418 | 第三类 2419 | 第三篇 2420 | 第三期 2421 | 第三日 2422 | 第三声 2423 | 地三鲜 2424 | 第三项 2425 | 第三站 2426 | 第三张 2427 | 第十八 2428 | 第十次 2429 | 第十二 2430 | 的士高 2431 | 第十集 2432 | 第十届 2433 | 第十九 2434 | 第十六 2435 | 第十名 2436 | 第十三 2437 | 第十四 2438 | 第十天 2439 | 第十一 2440 | 第十一个 2441 | 第四版 2442 | 第四册 2443 | 第四场 2444 | 第四代 2445 | 第四单元 2446 | 第四集 2447 | 第四届 2448 | 第四年 2449 | 第四期 2450 | 第四声 2451 | 第四套 2452 | 第四位 2453 | 第四张 2454 | 第四者 2455 | 第四种 2456 | 第五部 2457 | 第五大道 2458 | 第五单元 2459 | 第五集 2460 | 第五卷 2461 | 第五课 2462 | 第五年 2463 | 第五期 2464 | 第五位 2465 | 第五元素 2466 | 第五组 2467 | 召唤 2468 | 最后一班 2469 | 最后一遍 2470 | 最后一关 2471 | 最后一集 2472 | 最后一科 2473 | 最后一颗子弹 2474 | 最后一派 2475 | 最后一题 2476 | 最后一眼 2477 | 最后一页 2478 | 10 2479 | 11 2480 | 12 2481 | 35 2482 | 25 2483 | 2016 2484 | 2015 2485 | 2014 2486 | 又为什么 2487 | 有问题吗 2488 | 有问题么 2489 | 又喜欢 2490 | 有喜欢 2491 | 又小 2492 | 又笑 2493 | 有笑 2494 | 有效地 2495 | 有一百 2496 | 又一遍 2497 | 有一部 2498 | 又一城 2499 | 又一村 2500 | 有一道 2501 | 有意的 2502 | 有一堆 2503 | 有一对 2504 | 有一方 2505 | 有一根 2506 | 有一会了 2507 | 有一批 2508 | 有一片 2509 | 有一期 2510 | 有一起 2511 | 有一群 2512 | 又又 2513 | 由由 2514 | 财新网 2515 | 上午 2516 | 下午 2517 | NULL 2518 | 新华社 2519 | 消息 2520 | 13 2521 | 14 2522 | 15 2523 | 16 2524 | 17 2525 | 18 2526 | 19 2527 | 20 2528 | 21 2529 | 22 2530 | 23 2531 | 24 2532 | 26 2533 | 27 2534 | 28 2535 | 29 2536 | 30 2537 | 31 2538 | 32 2539 | 33 2540 | 34 2541 | 36 2542 | 37 2543 | 38 2544 | 39 2545 | 40 2546 | 41 2547 | 42 2548 | 43 2549 | 44 2550 | 45 2551 | 46 2552 | 47 2553 | 48 2554 | 49 2555 | 50 2556 | 51 2557 | 52 2558 | 53 2559 | 54 2560 | 55 2561 | 56 2562 | 57 2563 | 58 2564 | 59 2565 | 60 2566 | 61 2567 | 62 2568 | 63 2569 | 64 2570 | 65 2571 | 66 2572 | 67 2573 | 68 2574 | 69 2575 | 70 2576 | 71 2577 | 72 2578 | 73 2579 | 74 2580 | 75 2581 | 76 2582 | 77 2583 | 78 2584 | 79 2585 | 80 2586 | 81 2587 | 82 2588 | 83 2589 | 84 2590 | 85 2591 | 86 2592 | 87 2593 | 88 2594 | 89 2595 | 90 2596 | 91 2597 | 92 2598 | 93 2599 | 94 2600 | 95 2601 | 96 2602 | 97 2603 | 98 2604 | 99 2605 | 100 2606 | 01 2607 | 02 2608 | 03 2609 | 04 2610 | 05 2611 | 06 2612 | 07 2613 | 08 2614 | 09 2615 | 达川 2616 | 达州 2617 | 达州市 2618 | 2018 2619 | 现场 2620 | 发生 2621 | 被困 2622 | 发现 2623 | 事件 2624 | 视频 2625 | 介绍 2626 | 成都 2627 | 四川 2628 | 城市 2629 | 地面 2630 | 狂犬病 2631 | 聊城市 2632 | APP 2633 | 威望 2634 | 一新 -------------------------------------------------------------------------------- /clean/相识度计算.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | -------------------------------------- 4 | @describe 
5 | @version: 1.0 6 | @project: yuqing_system 7 | @file: 相识度计算.py 8 | @author: yuanlang 9 | @time: 2019-08-06 15:13 10 | --------------------------------------- 11 | """ 12 | import jieba 13 | # import Levenshtein 14 | import difflib 15 | import numpy as np 16 | import pymysql 17 | 18 | # jieba.load_userdict("dict.txt") 19 | 20 | class StrSimilarity(): 21 | 22 | __stop_words=["苑","园","大厦","大街","None","公寓","里","花园","公园","小区","期","区"] 23 | 24 | def __init__(self, word): 25 | self.word = word 26 | 27 | # def stopwordslist(self,filepath): 28 | # stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()] 29 | # return stopwords 30 | 31 | def stopwordslist(self, filepath): 32 | stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()] 33 | return stopwords 34 | 35 | # 对句子去除停用词 36 | def movestopwords(self,sentence): 37 | # stopwords = self.stopwordslist('语料/hlt_stop_words.txt') # 这里加载停用词的路径 38 | outstr = '' 39 | for word in sentence: 40 | if word not in self.__stop_words: 41 | if word != '\t' and '\n': 42 | outstr += word 43 | # outstr += " " 44 | return outstr 45 | 46 | # Compared函数,参数str_list是对比字符串列表 47 | # 返回原始字符串分词后和对比字符串的匹配次数,返回一个字典 48 | def Compared(self, str_list): 49 | dict_data = {} 50 | sarticiple = self.movestopwords(jieba.cut(self.word.strip())) 51 | for strs in str_list: 52 | num = 0 53 | for sart in sarticiple: 54 | if sart in strs: 55 | num = num + 1 56 | else: 57 | num = num 58 | dict_data[strs] = num 59 | return dict_data 60 | 61 | # NumChecks函数,参数dict_data是原始字符串分词后和对比字符串的匹配次数的字典,也就是Compared函数的返回值 62 | # 返回出现次数最高的两个,返回一个字典 63 | def NumChecks(self, dict_data): 64 | list_data = sorted(dict_data.items(), key=lambda asd: asd[1], reverse=True) 65 | length = len(list_data) 66 | json_data = {} 67 | if length >= 2: 68 | datas = list_data[:2] 69 | else: 70 | datas = list_data[:length] 71 | for data in datas: 72 | json_data[data[0]] = data[1] 73 | return json_data 74 | 75 | # MMedian函数,参数dict_data是出现次数最高的两个对比字符串的字典,也就是NumChecks函数的返回值 76 | # 返回对比字符串和调节值的字典 77 | def MMedian(self, dict_data): 78 | median_list = {} 79 | length = len(self.word) 80 | for k, v in dict_data.items(): 81 | num = np.median([len(k), length]) 82 | if abs(length - num) != 0: 83 | # xx = (1.0/(abs(length-num)))*0.1 84 | xx = (abs(length - num)) * 0.017 85 | else: 86 | xx = 0 87 | median_list[k] = xx 88 | return median_list 89 | 90 | # Appear函数,参数dict_data是对比字符串和调节值的字典,也就是MMedian函数的返回值 91 | # 返回最相似的字符串 92 | def Appear(self, dict_data): 93 | json_data = {} 94 | for k, v in dict_data.items(): 95 | fraction = difflib.SequenceMatcher(None, self.word, k).quick_ratio() - v 96 | json_data[k] = fraction 97 | tulp_data = sorted(json_data.items(), key=lambda asd: asd[1], reverse=True) 98 | return tulp_data[0] 99 | 100 | 101 | def main(can_zhao_biao,mu_biao_biao): 102 | conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8") 103 | cursor = conn.cursor() 104 | 105 | # cursor.execute("select url,title from seed;") 106 | # lj_items = cursor.fetchall() 107 | # str_list = [] 108 | # _dict = {} 109 | # for lj_item in lj_items: 110 | # aim = "{0}{1}{2}{3}{4}".format(lj_item[1], lj_item[2], lj_item[3], lj_item[4], lj_item[4]) 111 | # str_list.append(aim) 112 | # _dict[aim] = lj_item[0] 113 | 114 | str_list="2018年10月7日达川区南外济民医院门口突然塌陷事件" 115 | 116 | while True: 117 | cursor.execute("select url,title from seed limit 1") 118 | f5_items = cursor.fetchall() 119 | if len(f5_items) == 0: 120 | break 121 | 122 | 
query_str,query_id= '','' 123 | for f5_item in f5_items: 124 | query_id=f5_item[0] 125 | query_str = f5_item[1] 126 | 127 | ss = StrSimilarity(query_str) 128 | list_data = ss.Compared(str_list) 129 | num = ss.NumChecks(list_data) 130 | mmedian = ss.MMedian(num) 131 | print(query_str+" ===> "+ss.Appear(mmedian)[0]+":"+str(ss.Appear(mmedian)[1])) 132 | 133 | # sql="update %s set lj_xiaoqu_id='%s',ration=%12.10f where xiaoqu_id='%s'"%\ 134 | # (mu_biao_biao,_dict[ss.Appear(mmedian)[0]],ss.Appear(mmedian)[1],query_id) 135 | # cursor.execute(sql) 136 | # conn.commit() 137 | 138 | if __name__ == "__main__": 139 | #参照表hs_community_dict_fang 140 | can_zhao_biao="poi_ration" 141 | #目标表 142 | mu_biao_biao="shop_ration" 143 | main(can_zhao_biao,mu_biao_biao) -------------------------------------------------------------------------------- /clean/词频统计_LDA主题模型.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | -------------------------------------- 4 | @describe 数据整理 5 | @version: 1.0 6 | @project: yuqing_system 7 | @file: 词频统计_LDA主题模型.py 8 | @author: yuanlang 9 | @time: 2019-08-07 10:00 10 | --------------------------------------- 11 | """ 12 | import os 13 | import jieba 14 | import pymysql 15 | import pandas as pd 16 | import gensim 17 | import numpy 18 | import matplotlib.pyplot as plt 19 | from wordcloud import WordCloud#词云包 20 | from gensim import corpora, models, similarities 21 | # 编码问题 22 | plt.rcParams['figure.figsize'] = (5.0, 5.0) 23 | plt.rcParams['font.sans-serif'] = ['simhei'] 24 | plt.rcParams['axes.unicode_minus'] = False 25 | 26 | print(os.path.dirname(__file__)) 27 | # 导入停用词 28 | stopwords=pd.read_csv(f"{os.path.dirname(__file__)}/stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8') 29 | stopwords=stopwords['stopword'].values 30 | 31 | # 读取新闻内容 32 | df = pd.read_csv(f"{os.path.dirname(__file__)}/地陷事件.csv", encoding='utf-8',sep = '&@@&') 33 | # df = pd.read_csv(f"{os.path.dirname(__file__)}/出租车罢工.csv", encoding='utf-8',sep = '&@@&') 34 | # df = pd.read_csv(f"{os.path.dirname(__file__)}/好一新大火.csv", encoding='utf-8',sep = '&@@&') 35 | 36 | x=0 37 | lines=[((++x),item) for item in df.content.values.tolist()] 38 | 39 | # 原始数据 40 | # conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8") 41 | # cursor = conn.cursor() 42 | # cursor.execute("select * from context") 43 | # lines=cursor.fetchall() 44 | 45 | def db_to_csv(lines): 46 | """保存到本地""" 47 | with open("好一新大火.csv","w",encoding="utf-8") as f: 48 | f.writelines("url&@@&content\n") 49 | for line in lines: 50 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "") 51 | print(text) 52 | f.writelines("\""+line[0]+"\""+"&@@&"+"\""+text+"\"\n") 53 | 54 | # db_to_csv(lines) 55 | 56 | def word_count(lines,stopwords): 57 | # 词频统计 58 | segment = [] 59 | for line in lines: 60 | try: 61 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "") 62 | segs = jieba.__lcut(text) 63 | for seg in segs: 64 | if len(seg) > 1 and seg != '\r\n' and seg not in stopwords: 65 | segment.append(seg) 66 | # print(segment) 67 | except Exception as e: 68 | print(e) 69 | 70 | words_df = pd.DataFrame({'segment': segment}) 71 | words_stat = words_df.groupby(by=['segment'])['segment'].agg(["size"]) 72 | words_stat = words_stat[1300:] 73 | words_stat = words_stat.reset_index().sort_values(by=["size"], ascending=False) 74 | print(words_stat[:1500]) 75 | wordcloud = 
WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80) 76 | word_frequence = {x[0]: x[1] for x in words_stat.head(1500).values} 77 | wordcloud = wordcloud.fit_words(word_frequence) 78 | plt.imshow(wordcloud) 79 | plt.show() 80 | 81 | # word_count(lines,stopwords) 82 | 83 | def lda(lines,stopwords): 84 | """lda主题""" 85 | sentences = [] 86 | for line in lines: 87 | try: 88 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "") 89 | segs = jieba.__lcut(text) 90 | segs = filter(lambda x: len(x) > 1, segs) 91 | segs = [seg for seg in list(segs) if seg not in stopwords] 92 | sentences.append(segs) 93 | except Exception as e: 94 | print(e) 95 | 96 | # 词袋模型 97 | dictionary = corpora.Dictionary(sentences) 98 | corpus = [dictionary.doc2bow(_sentence) for _sentence in sentences] 99 | lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20) 100 | 101 | # 主题模型打印 102 | print(lda.print_topics()) 103 | wors={} 104 | for topic in lda.print_topics(): 105 | words=topic[1].split("+") 106 | for word in words: 107 | ss=[ii.replace(" ","").replace("\"","") for ii in word.split("*")] 108 | print(wors.get(ss[1],0),ss[0],wors.get(ss[1],0)+float(ss[0])) 109 | wors[ss[1]]=wors.get(ss[1],0)+float(ss[0]) 110 | # print(ss) 111 | wors={x:float('%.3f'%y) for x,y in wors.items()} 112 | 113 | # 合并词 114 | data_dic = {'count': wors} 115 | data_df = pd.DataFrame(data_dic) 116 | data_df = data_df.reset_index().sort_values(by=["count"], ascending=False) 117 | print(data_df[:10]["index"]) 118 | print(data_df[:10].index) 119 | print(data_df[:10]["count"]) 120 | 121 | number = numpy.array(data_df[:10]["count"].values*1000) 122 | work_type = data_df[:10]["index"].values 123 | 124 | 125 | labels = tuple(work_type) 126 | fracs = number 127 | 128 | print(labels) 129 | plt.pie(x=fracs, labels=labels, autopct='%.0f%%') # autopct显示百分比 130 | plt.show() 131 | 132 | 133 | lda(lines,stopwords) -------------------------------------------------------------------------------- /dz_spider/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | -------------------------------------- 4 | @describe 5 | @version: 1.0 6 | @project: yuqing_system 7 | @file: __init__.py.py 8 | @author: yuanlang 9 | @time: 2019-07-26 17:50 10 | --------------------------------------- 11 | """ -------------------------------------------------------------------------------- /dz_spider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__init__.py -------------------------------------------------------------------------------- /dz_spider/dz_spider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
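For reference, here is a condensed, self-contained sketch of the topic-modelling flow used in `clean/词频统计_LDA主题模型.py` above: jieba tokenisation, stopword filtering, a gensim dictionary/bag-of-words corpus, then LDA. The two sample documents and the tiny stopword set are stand-ins for the CSV articles and `stopwords.txt`, and the sketch calls the public `jieba.lcut` tokenizer.

```python
# -*- coding: utf-8 -*-
# Minimal sketch of the LDA pipeline; sample docs and stopwords are stand-ins.
import jieba
from gensim import corpora
from gensim.models import LdaModel

docs = ["达川区南外发生路面塌陷，一辆出租车陷入坑中",
        "消防人员赶到现场，被困人员已被救出"]      # stand-ins for the CSV articles
stopwords = {"发生", "现场"}                       # stand-in for stopwords.txt

# Tokenize, keep words longer than one character, drop stopwords.
texts = [[w for w in jieba.lcut(d) if len(w) > 1 and w not in stopwords]
         for d in docs]

dictionary = corpora.Dictionary(texts)             # word <-> id mapping
corpus = [dictionary.doc2bow(t) for t in texts]    # bag-of-words per document

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

for topic_id, words in lda.print_topics(num_words=5):
    print(topic_id, words)                         # e.g. 0.12*"塌陷" + 0.08*"出租车" + ...

# Per-document topic distribution (not computed in the original script).
for bow in corpus:
    print(lda.get_document_topics(bow))
```

The original script goes further: it sums the per-word weights across its 20 printed topics, keeps the ten heaviest words, and plots their relative weights as a matplotlib pie chart.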
/dz_spider/dz_spider/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/common.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | -------------------------------------- 4 | @describe 5 | @version: 1.0 6 | @project: yuqing_system 7 | @file: common.py 8 | @author: yuanlang 9 | @time: 2019-07-26 17:50 10 | --------------------------------------- 11 | """ 12 | 13 | site_name=["凤凰山下","达州市人民政府","闽南网","新京报网"] 14 | 15 | # 种子表 16 | seed_table="""create table if not exists `seed`( 17 | `url` varchar(500) Not null, 18 | `title` varchar(500) default "", 19 | `site_name` char(10) default "", 20 | `status` int(2) default 0, 21 | `create_time` timestamp default current_timestamp, 22 | `update_time` timestamp default current_timestamp, 23 | primary key (`url`) 24 | )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 25 | """ 26 | 27 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DzSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | # 随机更换user agent 3 | import random 4 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 5 | import base64 6 | import requests 7 | import redis 8 | import datetime 9 | import time 10 | 11 | 12 | class RotateUserAgentMiddleware(UserAgentMiddleware): 13 | def __init__(self, user_agent=''): 14 | self.user_agent = user_agent 15 | self._redis=redis.Redis(host="10.29.4.242",port=6379,db=0) 16 | # self.start_time=datetime.datetime.now() 17 | 18 | 19 | def get_proxy(self,name): 20 | key = self._redis.hgetall(name=name) 21 | rkey = random.choice(list(key.keys())) if key else None 22 | if isinstance(rkey, bytes): 23 | return rkey.decode('utf-8') 24 | else: 25 | return rkey 26 | 27 | def process_request(self, request, spider): 28 | ua = random.choice(self.user_agent_list) 29 
| #ip = random.choice(self.ip_proxy) 30 | # ip = self.get_proxy("useful_proxy") 31 | if ua: 32 | request.headers.setdefault('User-Agent', ua) 33 | # 设置代理 34 | # request.meta['proxy'] = 'http://{0}'.format(ip) 35 | 36 | #response=requests.get("http://10.29.4.242:5010/get/") 37 | #print('http://{0}'.format(response.text)) 38 | #request.meta['proxy'] = 'http://{0}'.format(response.text) 39 | # request.meta['proxy'] = "127.0.0.1:8888" 40 | # proxy_user_pass = 'XXXXXXXXXXXXXXX:KKKKKKKKKKKKKKKK' 41 | # encoded_user_pass = base64.b64encode(proxy_user_pass.encode(encoding='utf-8')) 42 | # request.headers['Proxy-Authorization'] = 'Basic ' + str(encoded_user_pass) 43 | 44 | ip_proxy = ['101.50.1.2:80', '54.36.1.22:3128', '178.238.228.187:9090', '149.56.108.133:3128', '190.2.137.31:1080', '13.125.162.226:3128', '128.199.182.128:3128', '122.216.120.254:80', '157.55.233.183:80', '85.10.247.140:1080', '90.84.242.77:3128', '159.65.156.178:3128', '54.38.100.98:1080', '119.28.221.28:8088', '139.224.24.26:8888', '190.2.137.9:1080', '178.32.181.66:3128', '47.88.35.91:3128', '103.78.213.147:80', '59.44.164.34:3128', '190.2.137.15:1080', '54.36.31.203:3128', '142.44.198.187:3128', '122.114.31.177:808', '66.195.76.86:8080', '122.216.120.244:80', '212.237.34.18:8888', '134.119.205.147:1080', '159.89.201.219:3128', '50.28.48.83:8080', '211.159.219.158:80', '124.51.247.48:3128', '35.162.122.16:8888', '217.182.242.64:3128', '139.59.21.37:3128', '47.89.23.174:8080', '200.16.208.187:8080', '5.135.74.36:1080', '117.242.145.103:8080', '61.5.207.102:80', '61.135.217.7:80', '71.13.112.152:3128', '5.135.74.37:1080', '211.159.177.212:3128', '210.5.149.43:8090', '122.72.18.35:80', '212.237.51.54:8888', '61.136.163.245:8107', '124.193.37.5:8888', '120.78.182.79:3128', '180.173.67.197:9797', '171.97.67.88:3128', '145.239.185.127:1080', '167.99.70.26:8080', '159.65.141.81:3128', '180.235.42.148:8080', '67.205.159.46:3128', '121.8.98.198:80', '151.80.140.233:54566', '139.59.224.113:8080', '47.91.165.126:80', '5.9.78.89:3128', '142.44.202.122:3128', '35.198.103.196:3128', '39.137.47.11:80', '142.44.197.15:3128', '190.2.137.38:1080', '122.216.120.251:80', '159.65.139.226:3128', '116.11.254.37:80', '36.80.123.114:3128', '194.67.220.181:3128', '217.182.216.236:3128', '190.2.137.47:1080', '163.172.217.103:3128', '145.239.185.122:1080', '212.237.37.152:8888', '219.135.164.245:3128', '119.28.26.57:3128', '120.77.254.116:3128', '60.207.106.140:3128', '14.139.189.216:3128', '212.126.117.158:80', '120.26.160.183:8090', '142.44.198.121:3128', '218.50.2.102:8080', '183.179.199.225:8080', '116.58.227.143:3128', '144.202.70.37:3128', '119.28.112.130:3128', '45.63.95.172:3128', '167.99.87.147:8080', '202.175.61.162:8080', '200.63.129.131:80', '194.182.74.203:3128', '77.244.21.75:3128', '118.212.137.135:31288', '145.239.185.121:1080', '190.2.137.45:1080', '5.167.54.154:8080', '50.233.137.38:80', '112.21.164.58:1080', '45.76.56.140:3128', '35.200.194.218:3128', '159.65.142.92:3128', '37.204.219.50:8081', '113.214.13.1:8000', '47.90.72.227:8088', '114.130.42.20:80', '119.28.152.208:80', '167.99.78.239:8080', '144.202.70.81:3128', '151.80.9.177:3128', '151.106.10.230:1080', '104.155.53.214:3128', '123.57.133.142:3128', '151.106.5.26:1080', '5.9.78.28:3128', '47.75.56.36:8118', '66.70.147.195:3128', '114.232.171.58:48354', '122.72.18.34:80', '5.135.74.32:1080', '114.130.42.20:3128'] 45 | 46 | user_agent_list = [ \ 47 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \ 48 | "Mozilla/5.0 
(X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 49 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 50 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 52 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 53 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 54 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 55 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 56 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 57 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 58 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 59 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 60 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 61 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 62 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 63 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 64 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 65 | ] -------------------------------------------------------------------------------- /dz_spider/dz_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pymongo 3 | from scrapy import log 4 | from scrapy.conf import settings 5 | import threading 6 | from openpyxl import Workbook 7 | import redis 8 | from scrapy.pipelines.images import ImagesPipeline 9 | from scrapy.exceptions import DropItem 10 | import scrapy 11 | import pymysql 12 | from twisted.enterprise import adbapi 13 | import random 14 | import sys 15 | from scrapy.log import logger 16 | # Define your item pipelines here 17 | # 18 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 19 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 20 | # 单例模式创建MongoPipline 21 | 22 | Lock = threading.Lock() 23 | 24 | 25 | class MongoPipeline(object): 26 | # 定义静态变量实例 27 | __instance = None 28 | 29 | def __init__(self): 30 | pass 31 | 32 | def __new__(cls, *args, **kwargs): 33 | if not cls.__instance: 34 | try: 35 | Lock.acquire() 36 | # double check 37 | if not cls.__instance: 38 | cls.client = pymongo.MongoClient(settings['MONGO_URI']) 39 | cls.db = cls.client[settings['MONGO_DATABASE']] 40 | cls.__instance = super(MongoPipeline, cls).__new__(cls, *args, **kwargs) 41 | finally: 42 | Lock.release() 43 | return cls.__instance 44 | 45 | def dorp_connection(self,db_name): 46 | return self.db[db_name].drop() 47 | 48 | def ensure_index(self,db_name,unique_id): 49 | return self.db[db_name].ensure_index(unique_id,unique=True) 50 | 51 | 52 | # def 
process_item(self, item,spider): 53 | # ''' 54 | # 异步增加,修改 55 | # :param item: 56 | # :param spider: 57 | # :return: 58 | # ''' 59 | # if item["operation"]=="insert": 60 | # 61 | # try: 62 | # self.db[item["db"]].insert(dict(item["info"])) 63 | # log.msg("[{0} line:{1}] insert {2}=====>>>>>种子入库". 64 | # format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]), level=log.INFO) 65 | # except Exception as e: 66 | # log.msg("[{0} line:{1}] {2}". 67 | # format(self.__class__.__name__, sys._getframe().f_lineno, e),level=log.ERROR) 68 | # 69 | # elif item["operation"]=="upsert": 70 | # self.db[item["db"]].update(item["condition"], item["info"], True) 71 | # log.msg("[{0} line:{1}] upsert {2}=====>>>>更新种子信息" 72 | # .format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]),level=log.INFO) 73 | # elif item["operation"]=="update": 74 | # self.db[item["db"]].update(item["condition"], item["info"], False) 75 | # log.msg("[{0} line:{1}] update {2}=====>>>>更新种子信息" 76 | # .format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]),level=log.INFO) 77 | 78 | def process_item(self, item,db_name): 79 | try: 80 | self.db[db_name].insert(dict(item)) 81 | log.msg("[{0} line:{1}] insert {2}=====>>>>>种子入库". 82 | format(self.__class__.__name__, sys._getframe().f_lineno,db_name), level=log.INFO) 83 | except Exception as e: 84 | log.msg("[{0} line:{1}] {2}". 85 | format(self.__class__.__name__, sys._getframe().f_lineno, e), level=log.ERROR) 86 | 87 | def process_items(self, items, db_name): 88 | try: 89 | self.db[db_name].insert(items) 90 | log.msg("[{0} line:{1}] insert {2}=====>>>>>种子入库". 91 | format(self.__class__.__name__, sys._getframe().f_lineno,db_name), level=log.INFO) 92 | except Exception as e: 93 | log.msg("[{0} line:{1}] {2}". 94 | format(self.__class__.__name__, sys._getframe().f_lineno, e), level=log.ERROR) 95 | 96 | def seed_find(self,db_name,conditions,return_range): 97 | log.msg("[{0} line:{1}] find {2}=====>>>>>小区列表页种子查询" 98 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name), 99 | level=log.INFO) 100 | return self.db[db_name].find(conditions,return_range) 101 | 102 | def info_update(self,db_name,conditions,info): 103 | log.msg("[{0} line:{1}] update {2}=====>>>>更新种子信息" 104 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name), 105 | level=log.INFO) 106 | return self.db[db_name].update(conditions,info,False) 107 | 108 | def info_upsert(self,db_name,conditions,info): 109 | log.msg("[{0} line:{1}] update {2}=====>>>>更新种子信息" 110 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name), 111 | level=log.INFO) 112 | return self.db[db_name].update(conditions,info,True) 113 | 114 | def info_update_many(self,db_name,conditions,info): 115 | log.msg("[{0} line:{1}] update {2}=====>>>>更新种子信息" 116 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name), 117 | level=log.INFO) 118 | return self.db[db_name].update_many(conditions,info,False) 119 | 120 | 121 | # ######################################链家房产################################ 122 | # ###小区 123 | # def lianjia_xiaoqu_insert_seed(self, seed): 124 | # ''' 125 | # 小区列表页种子入库 126 | # :param seed: 127 | # :return: 128 | # ''' 129 | # log.msg("[{0} line:{1}] insert LianJiaXiaoQuSeed=====>>>>>链家小区列表页种子入库".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 130 | # return self.db["LianJiaXiaoQuSeed"].insert(seed) 131 | # 132 | # def lianjia_xiaoqu_find_seed1(self): 133 | # ''' 134 | # 链家小区列表页种子提取 135 | # :return: 136 | # ''' 137 | # 
print("finid操作======》查询链家小区列表页种子") 138 | # return self.db["LianJiaXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 139 | # 140 | # def lianjia_xiaoqu_find_seed2(self): 141 | # ''' 142 | # 链家小区详细信息种子提取 143 | # :return: 144 | # ''' 145 | # print("finid操作======》查询链家详细信息页种子") 146 | # return self.db["LianJiaXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0}) 147 | # 148 | # def lianjia_xiaoqu_update_seed(self, seed): 149 | # ''' 150 | # 更新小区列表页种子状态 151 | # :param seed: 152 | # :return: 153 | # ''' 154 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuSeed=====>>>>>更新链家列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 155 | # return self.db["LianJiaXiaoQuSeed"].update_one({"url":seed["url"]}, 156 | # {"$set":{"status":seed["status"], 157 | # "ts":seed["ts"]}},True) 158 | # def lianjia_xiaoqu_img_find(self): 159 | # return self.db["LianJiaXiaoQuImg"].find({"status": 0}, {"xiaoquImgs": 1, "_id": 0,"xiaoquId":1}) 160 | # 161 | # def lianjia_xiaoqu_img_update(self,item): 162 | # return self.db["LianJiaXiaoQuImg"].update_many({"xiaoquId":item["xiaoquId"]}, 163 | # {"$set":{"status":item["status"], 164 | # "ts":item["ts"]}}) 165 | # 166 | # def lianjia_xiaoqu_update_info(self, info): 167 | # ''' 168 | # 更新链家详细页种子状态和详细信息 169 | # :param info: 170 | # :return: 171 | # ''' 172 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuInfo=====>>>>>更新家详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 173 | # return self.db["LianJiaXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]}, 174 | # {"$set":{"address": info["address"], 175 | # "build_year": info['build_year'], 176 | # "build_type": info['build_type'], 177 | # "property_cost" : info["property_cost"], 178 | # "property_company": info["property_company"], 179 | # "developer": info["developer"], 180 | # "lou_dong_count" : info["lou_dong_count"], 181 | # "house_count": info["house_count"], 182 | # "nerber_shop": info["nerber_shop"], 183 | # "longitude": info["longitude"], 184 | # "latitude": info["latitude"], 185 | # "chengjiao_url":info["chengjiao_url"], 186 | # "imgs":info["imgs"], 187 | # "nerber_xiaoqu":info["nerber_xiaoqu"], 188 | # "xiaoqu_name_other":info["xiaoqu_name_other"], 189 | # "status": info["status"], 190 | # "html":info["html"], 191 | # "follow":info["follow"], 192 | # "sale_url": info["sale_url"], 193 | # "rent_url": info["rent_url"], 194 | # "ts":info["ts"] 195 | # }},True) 196 | # 197 | # ###成交部分 198 | # def lianjia_chengjiao_insert_seed(self, seed): 199 | # ''' 200 | # 链家成交种子保存 201 | # :param seed: 202 | # :return: 203 | # ''' 204 | # log.msg("[{0} line:{1}] insert LianJiaChengJiaoFangSeed=====>>>>>插入链家成交列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 205 | # return self.db["LianJiaChengJiaoFangSeed"].insert_many(seed) 206 | # 207 | # ###二手房部分 208 | # def lianjia_ershoufang_insert_seed(self, seed): 209 | # ''' 210 | # 链家二手房种子保存 211 | # :param seed: 212 | # :return: 213 | # ''' 214 | # return self.db["LianJiaErShouFangSeed"].insert_many(seed) 215 | # 216 | # def lianjia_ershoufang_find_seed1(self): 217 | # ''' 218 | # 链家二手房列表页种子提取 219 | # :return: 220 | # ''' 221 | # return self.db["LianJiaErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 222 | # 223 | # def lianjia_ershoufang_find_seed2(self): 224 | # ''' 225 | # 链家二手房详细页种子提取 226 | # :return: 227 | # ''' 228 | # return self.db["LianJiaErShouFangInfo"].find({"status": 0}, {"url": 1, "_id": 0}) 229 | # 230 | # def lianjia_ershoufang_update_seed(self,seed): 231 | # 
''' 232 | # 链家二手房列表页种子状态更新 233 | # :param seed: 234 | # :return: 235 | # ''' 236 | # print("update操作======》更新链家二手房列表页状态: "+str(seed)) 237 | # return self.db["LianJiaErShouFangSeed"].update_one({"url":seed["url"]}, 238 | # {"$set":{"status":seed["status"], 239 | # "ts":seed["ts"]}}) 240 | # def lianjia_ershoufang_update_info_seed(self,seed): 241 | # ''' 242 | # 链家二手房详细页状态更新 243 | # :param seed: 244 | # :return: 245 | # ''' 246 | # print("update操作======》更新链家二手房详细页状态: "+str(seed)) 247 | # return self.db["LianJiaErShouFangInfo"].update_one({"url":seed["url"]}, 248 | # {"$set":{"status":seed["status"], 249 | # "ts":seed["ts"]}}) 250 | # def lianjia_ershoufang_update_info(self, info): 251 | # ''' 252 | # 链家二手房详细信息更新 253 | # :param info: 254 | # :return: 255 | # ''' 256 | # print("update操作======》更新链家二手房详细信息: " + info['url']) 257 | # self.db["LianJiaErShouFangInfo"].update({"url": info['url']}, 258 | # {"$set":{"buyPoint": info["buyPoint"], 259 | # "layout": info['layout'], 260 | # "floor" : info["floor"], 261 | # "buildArea": info["buildArea"], 262 | # "layoutStructure": info["layoutStructure"], 263 | # "area": info["area"], 264 | # "buildType": info["buildType"], 265 | # "chaoXiang": info["chaoXiang"], 266 | # "buildStructure": info["buildStructure"], 267 | # "decoration": info["decoration"], 268 | # "ladderProportion": info["ladderProportion"], 269 | # "heatingMode": info["heatingMode"], 270 | # "propertyRightYear": info["propertyRightYear"], 271 | # "publishDate": info["publishDate"], 272 | # "transAttributes": info["transAttributes"], 273 | # "lastTransaction": info["lastTransaction"], 274 | # "houseUse": info["houseUse"], 275 | # "houseYear": info["houseYear"], 276 | # "propertybelong": info["propertybelong"], 277 | # "emortgage": info["emortgage"], 278 | # "backUp": info["backUp"], 279 | # "houseTag": info["houseTag"], 280 | # "traffic": info["traffic"], 281 | # "decoration_desc": info["decoration_desc"], 282 | # "layout_instru": info["layout_instru"], 283 | # "longitude": info["longitude"], 284 | # "latitude": info["latitude"], 285 | # "ts":info["ts"] 286 | # }} ,True) 287 | # 288 | # def lianjia_chengjiaofang_update_seed(self,seed): 289 | # ''' 290 | # 链家成交房列表页种子状态更新 291 | # :param seed: 292 | # :return: 293 | # ''' 294 | # log.msg("[{0} line:{1}] update LianJiaChengJiaoFangSeed=====>>>>>更新链家成交房列表页状态".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 295 | # return self.db["LianJiaChengJiaoFangSeed"].update_one({"url":seed["url"]}, 296 | # {"$set":{"status":seed["status"], 297 | # "ts":seed["ts"]}}) 298 | # 299 | # def lianjia_chengjiaofang_find_seed(self): 300 | # ''' 301 | # 链家小区交易种子提取 302 | # :return: 303 | # ''' 304 | # return self.db["LianJiaXiaoQuInfo"].find({"status": 1,"chengjiao_url":{"$ne":""}}, {"chengjiao_url": 1, "_id": 0}) 305 | # 306 | # def lianjia_chengjiaofang_find_seed1(self): 307 | # ''' 308 | # 链家成交房列表页种子提取 309 | # :return: 310 | # ''' 311 | # return self.db["LianJiaChengJiaoFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 312 | # 313 | # def lianjia_chengjiaofang_find_seed2(self): 314 | # ''' 315 | # 链家成交房详细页种子提取 316 | # :return: 317 | # ''' 318 | # return self.db["LianJiaChengJiaoFangInfo"].find({"status": 0}, {"chengjiao_url": 1, "_id": 0}) 319 | # 320 | # def lianjia_chengjiaofang_update_info(self, info): 321 | # ''' 322 | # 链家成交房详细信息更新 323 | # :param info: 324 | # :return: 325 | # ''' 326 | # log.msg("[{0} line:{1}] update LianJiaChengJiaoFangInfo=====>>>>>更新链家成交房详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), 
level=log.INFO) 327 | # self.db["LianJiaChengJiaoFangInfo"].update({"chengjiao_url": info['chengjiao_url']}, 328 | # {"$set":{"xiaoqu_id": info["xiaoqu_id"], 329 | # "chengjiao_url": info["chengjiao_url"], 330 | # "trade_date": info["trade_date"], 331 | # "trade_channel": info["trade_channel"], 332 | # "total_price": info['total_price'], 333 | # "unit_price" : info["unit_price"], 334 | # "list_price": info["list_price"], 335 | # "transaction_cycle": info['transaction_cycle'], 336 | # "modify_price": info["modify_price"], 337 | # "watch": info["watch"], 338 | # "follow": info["follow"], 339 | # "layout": info["layout"], 340 | # "floor": info["floor"], 341 | # "build_area": info["build_area"], 342 | # "layout_structure": info["layout_structure"], 343 | # "area": info["area"], 344 | # "build_type": info["build_type"], 345 | # "orientation": info["orientation"], 346 | # "house_year": info["house_year"], 347 | # "build_year": info["build_year"], 348 | # "decoration": info["decoration"], 349 | # "build_structure": info["build_structure"], 350 | # "ladder_ratio": info["ladder_ratio"], 351 | # "heating_mode": info["heating_mode"], 352 | # "right_year": info["right_year"], 353 | # "has_elevator": info["has_elevator"], 354 | # "publish_date": info["publish_date"], 355 | # "transaction_attr": info["transaction_attr"], 356 | # "last_tranfic": info["last_tranfic"], 357 | # "house_use": info["house_use"], 358 | # "house_year": info["house_year"], 359 | # "right_belong": info["right_belong"], 360 | # "layout_instru": info["layout_instru"], 361 | # "emortgage": info["emortgage"], 362 | # "back_up": info["back_up"], 363 | # "record": info["record"], 364 | # "house_tag": info["house_tag"], 365 | # "xiaoqu_instru":info["xiaoqu_instru"], 366 | # "sax_analysis": info["sax_analysis"], 367 | # "traffic": info["traffic"], 368 | # "decoration_desc": info["decoration_desc"], 369 | # "layout_instru": info["layout_instru"], 370 | # "buy_point": info["buy_point"], 371 | # "imgs": info["imgs"], 372 | # "ts":info["ts"], 373 | # "status":info["status"], 374 | # "html":info["html"] 375 | # }} ,True) 376 | # 377 | # def lianjia_xiaoqu_update_chengjiao_seed(self, info): 378 | # ''' 379 | # 链家成交房详细信息更新 380 | # :param info: 381 | # :return: 382 | # ''' 383 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuInfo=====>>>>>更新链家小区成交房种子状态 status=2".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 384 | # self.db["LianJiaXiaoQuInfo"].update({"chengjiao_url": info['chengjiao_url']}, 385 | # {"$set":{"status": info["status"], 386 | # "ts":info["ts"] 387 | # }} ,True) 388 | # 389 | # 390 | # #房价 391 | def lianjia_fangjia_insert_seed(self, seed): 392 | ''' 393 | 房价种子入库 394 | :param seed: 395 | :return: 396 | ''' 397 | print("insert操作======》链家房价种子入库") 398 | return self.db["LianJiaFangJiaSeed"].insert(seed) 399 | # 400 | def lianjia_fangjia_update_seed(self,seed): 401 | ''' 402 | 链家成交房列表页种子状态更新 403 | :param seed: 404 | :return: 405 | ''' 406 | print("update操作======》更新链家房价种子状态: "+str(seed)) 407 | return self.db["LianJiaFangJiaSeed"].update_one({"url":seed["url"]}, 408 | {"$set":{"status":seed["status"], 409 | "ts":seed["ts"]}}) 410 | # 411 | # 412 | def lianjia_fangjia_find_seed(self): 413 | ''' 414 | :return: 415 | ''' 416 | print("finid操作======》查询链家房价种子") 417 | return self.db["LianJiaFangJiaSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 418 | # 419 | # ####租房部分 420 | # def lianjia_zufang_insert_seed(self, seed): 421 | # return self.db["LianJiaZuFangSeed"].insert_many(seed) 422 | # 423 | # ##未实现循环读取Redis 424 | # 
def lianjia_zufang_find_seed(self): 425 | # return self.db["LianJiaZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 426 | # 427 | # def lianjia_zufang_update_seed(self,seed): 428 | # print("update操作======》更新链家租房url状态: "+str(seed)) 429 | # return self.db["LianJiaZuFangSeed"].update_one({"url":seed["url"]}, 430 | # {"$set":{"status":seed["status"], 431 | # "ts":seed["ts"]}}) 432 | # 433 | # #################################我爱我家################################################## 434 | # ###我爱我家小区 435 | # 436 | # def f5j5j_xiaoqu_insert_seed(self, seed): 437 | # ''' 438 | # 我爱我家小区列表页种子保存 439 | # :param seed: 440 | # :return: 441 | # ''' 442 | # return self.db["F5J5JXiaoQuSeed"].insert_many(seed) 443 | # 444 | # def f5j5j_xiaoqu_find_seed1(self): 445 | # ''' 446 | # 我爱我家小区列表页种子提取 447 | # :return: 448 | # ''' 449 | # print("finid操作======》查询我爱我家小区列表页种子") 450 | # return self.db["F5J5JXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 451 | # 452 | # def f5j5j_xiaoqu_find_seed2(self): 453 | # ''' 454 | # 我爱我家小区详细信息种子提取 455 | # :return: 456 | # ''' 457 | # print("finid操作======》查询我爱我家详细信息页种子") 458 | # return self.db["F5J5JXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0}) 459 | # 460 | # def f5j5j_xiaoqu_update_seed(self, seed): 461 | # ''' 462 | # 更新小区列表页种子状态 463 | # :param seed: 464 | # :return: 465 | # ''' 466 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuSeed=====>>>>>更新我爱我家列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 467 | # return self.db["F5J5JXiaoQuSeed"].update_one({"url":seed["url"]}, 468 | # {"$set":{"status":seed["status"], 469 | # "ts":seed["ts"]}}) 470 | # 471 | # def f5j5j_xiaoqu_update_info(self, info): 472 | # ''' 473 | # 更新我爱我家详细页种子状态和详细信息 474 | # :param info: 475 | # :return: 476 | # ''' 477 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuInfo=====>>>>>更新我爱我家详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 478 | # return self.db["F5J5JXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]}, 479 | # {"$set":{"address": info["address"], 480 | # "build_year": info['build_year'], 481 | # "build_type": info['build_type'], 482 | # "lou_dong_count": info["lou_dong_count"], 483 | # "house_count": info["house_count"], 484 | # "region_three" : info["region_three"], 485 | # "region_four": info["region_four"], 486 | # "follow": info["follow"], 487 | # "chengjiao_url":info["chengjiao_url"], 488 | # "property_company": info["property_company"], 489 | # "developer": info["developer"], 490 | # "greening_rate": info["greening_rate"], 491 | # "traffic": info["traffic"], 492 | # "nerber_shop": info["nerber_shop"], 493 | # "trend": info['trend'], 494 | # "imgs": info["imgs"], 495 | # "longitude": info["longitude"], 496 | # "latitude": info["latitude"], 497 | # "status": info["status"], 498 | # "ts":info["ts"] 499 | # }} ,True) 500 | # 501 | # def f5j5j_xiaoqu_img_find(self): 502 | # return self.db["F5J5JXiaoQuImg"].find({"status": 0}, {"image_urls": 1, "_id": 0,"id":1}) 503 | # 504 | # def f5j5j_xiaoqu_img_update(self,item): 505 | # return self.db["F5J5JXiaoQuImg"].update_many({"id":item["id"]}, 506 | # {"$set":{"status":item["status"], 507 | # "ts":item["ts"]}}) 508 | # #成交 509 | # def f5j5j_chengjiao_find_seed(self): 510 | # ''' 511 | # 成交种子提取 512 | # :return: 513 | # ''' 514 | # print("finid操作======》查询我爱我家成交房详细信息页种子") 515 | # return self.db["F5J5JXiaoQuInfo"].find({"status": 1}, {"chengjiao_url": 1, "_id": 0}) 516 | # 517 | # def f5j5j_chengjiao_update_info(self,seed): 518 | # ''' 519 | # 更新种子 520 | # 
:return: 521 | # ''' 522 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuInfo=====>>>>>更新我爱我家小区成交房种子状态[chengjiao_url:{2}]".format(self.__class__.__name__, sys._getframe().f_lineno,seed["chengjiao_url"]), level=log.INFO) 523 | # return self.db["F5J5JXiaoQuInfo"].update_one({"chengjiao_url": seed["chengjiao_url"]}, 524 | # {"$set":{ 525 | # "status":seed["status"], 526 | # "ts":seed["ts"] 527 | # }},True) 528 | # 529 | # #二手房 530 | # def f5j5j_ershoufang_insert_seed(self, seed): 531 | # return self.db["F5J5JErShouFangSeed"].insert_many(seed) 532 | # 533 | # def f5j5j_ershoufang_find_seed1(self): 534 | # ''' 535 | # 我爱我家小区列表页种子提取 536 | # :return: 537 | # ''' 538 | # print("finid操作======》查询我爱我家小区列表页种子") 539 | # return self.db["F5J5JErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 540 | # 541 | # def f5j5j_ershoufang_find_seed2(self): 542 | # ''' 543 | # 我爱我家小区详细信息种子提取 544 | # :return: 545 | # ''' 546 | # print("finid操作======》查询我爱我家详细信息页种子") 547 | # return self.db["F5J5JErShouFangInfo"].find({"status": 0}, {"url": 1, "_id": 0}) 548 | # 549 | # def f5j5j_zufang_insert_seed(self, seed): 550 | # return self.db["F5J5JZuFangSeed"].insert_many(seed) 551 | # 552 | # def f5j5j_ershoufang_find_seed(self): 553 | # return self.db["F5J5JErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 554 | # 555 | # def f5j5j_ershoufang_update_seed(self,seed): 556 | # print("update操作======》更新我爱我家二手房url状态: "+str(seed)) 557 | # return self.db["F5J5JErShouFangSeed"].update_one({"url":seed["url"]}, 558 | # {"$set":{"status":seed["status"], 559 | # "ts":seed["ts"]}}) 560 | # 561 | # def f5j5j_ershoufang_update_info(self, info): 562 | # print("update操作======》更新我爱我家二手房详细信息: " + info['url']) 563 | # self.db["F5J5JErShouFangInfo"].update({"url": info['url']}, 564 | # {"$set": {"buyPoint": info["buyPoint"], 565 | # "layout": info['layout'], 566 | # "floor": info["floor"], 567 | # "area": info["area"], 568 | # "publishDate": info["publishDate"], 569 | # "buildYear": info["buildYear"], 570 | # "layout_instru": info["layout_instru"], 571 | # "traffic" :info["traffic"], 572 | # "taxAnalysis": info["taxAnalysis"], 573 | # "loanSituation": info["loanSituation"], 574 | # "arroundMatch": info["arroundMatch"], 575 | # "propertyMortgage": info["propertyMortgage"], 576 | # "xiaoquInfo": info["xiaoquInfo"], 577 | # "arroundMatch": info["arroundMatch"], 578 | # "status":info["status"], 579 | # "ts": info["ts"] 580 | # }}, True) 581 | # 582 | # def f5j5j_zufang_find_seed(self): 583 | # return self.db["F5J5JZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 584 | # 585 | # ###########################tc58################################# 586 | # ###五八同城小区 587 | # def tc58_xiaoqu_insert_seed(self, seed): 588 | # ''' 589 | # 小区列表页种子入库 590 | # :param seed: 591 | # :return: 592 | # ''' 593 | # print("insert操作======》tc58小区列表页种子入库") 594 | # return self.db["TC58XiaoQuSeed"].insert(seed) 595 | # 596 | # def tc58_xiaoqu_update_seed(self, info): 597 | # log.msg("update操作======》tc58小区列表页种子更新{0}".format(info['url']),level=log.INFO) 598 | # self.db["TC58XiaoQuSeed"].update({"url": info['url']}, 599 | # {"$set": {"status": info['status'], 600 | # "ts": info["ts"] 601 | # }}) 602 | # 603 | # def tc58_xiaoqu_update_info(self, info): 604 | # log.msg("update操作======》tc58小区详细页种子更新{0}".format(info['xiaoqu_url']),level=log.INFO) 605 | # self.db["TC58XiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']}, 606 | # {"$set": {"xiaoqu_name_two": info['xiaoqu_name_two'], 607 | # "huan_bi": info["huan_bi"], 608 | # 
"address":info["address"], 609 | # "build_type":info["build_type"], 610 | # "house_count": info["house_count"], 611 | # "property_type": info["property_type"], 612 | # "property_cost": info["property_cost"], 613 | # "far": info["far"], 614 | # "build_year": info["build_year"], 615 | # "greening_rate": info["greening_rate"], 616 | # "building_foot_print": info["building_foot_print"], 617 | # "building_area": info["building_area"], 618 | # "property_company": info["property_company"], 619 | # "developer": info["developer"], 620 | # "xiaoqu_id": info["xiaoqu_id"], 621 | # "trend": info["trend"], 622 | # "latitude":info["latitude"], 623 | # "longitude":info["longitude"], 624 | # "ts": info["ts"], 625 | # "status": info["status"] 626 | # }},True) 627 | # 628 | # def tc58_xiaoqu_find_seed1(self): 629 | # ''' 630 | # tc58小区列表页种子提取 631 | # :return: 632 | # ''' 633 | # print("finid操作======》查询链家小区列表页种子") 634 | # return self.db["TC58XiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 635 | # 636 | # def tc58_xiaoqu_find_seed2(self): 637 | # ''' 638 | # tc58小区详细信息种子提取 639 | # :return: 640 | # ''' 641 | # print("finid操作======》查询链家详细信息页种子") 642 | # return self.db["TC58XiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0}) 643 | # 644 | # 645 | # def tc58_personfang_update_seed(self, info): 646 | # log.msg("update操作======》个人房源列表页种子更新{0}".format(info['url']),level=log.INFO) 647 | # self.db["TC58PersonFangSeed"].update({"url": info['url']}, 648 | # {"$set": {"status": info['status'], 649 | # "ts": info["ts"] 650 | # }},True) 651 | # 652 | # def tc58_personfang_update_info(self, info): 653 | # log.msg("update操作======》tc58个人房源种子更新{0}".format(info['ershoufang_url']),level=log.INFO) 654 | # self.db["TC58PersonFangInfo"].update({"ershoufang_url": info['ershoufang_url']}, 655 | # {"$set": {"publish_date": info['publish_date'], 656 | # "total_price": info["total_price"], 657 | # "unit_price": info['unit_price'], 658 | # "xiaoqu_name_two": info["xiaoqu_name_two"], 659 | # "floor": info["floor"], 660 | # "layout": info["layout"], 661 | # "decoration": info["decoration"], 662 | # "area": info["area"], 663 | # "right_year": info["right_year"], 664 | # "orientation": info["orientation"], 665 | # "build_year": info["build_year"], 666 | # "buy_point": info["buy_point"], 667 | # "house_use": info["house_use"], 668 | # "transaction_attr": info["transaction_attr"], 669 | # "status": info["status"], 670 | # "ts": info["ts"] 671 | # }},True) 672 | # 673 | # def tc58_personfang_find_seed(self): 674 | # ''' 675 | # tc58小区详细信息种子提取 676 | # :return: 677 | # ''' 678 | # print("find操作======》查询tc58详细信息页种子") 679 | # return self.db["TC58PersonFangInfo"].find({"status": 0}, {"ershoufang_url": 1, "_id": 0}) 680 | # 681 | # def tc58_xiaoqu_list_insert_seed(self, seed): 682 | # ''' 683 | # 小区列表页种子入库 684 | # :param seed: 685 | # :return: 686 | # ''' 687 | # print("insert操作======》小区信息") 688 | # return self.db["TC58XiaoQu_list"].insert(seed) 689 | # 690 | # 691 | # #######################二手房 692 | # 693 | # def tc58_ershoufang_find_seed_from_xiaoqu(self): 694 | # return self.db["TC58XiaoQu"].find({"status": 0}, {"erShouFangUrl": 1, "_id": 0}).limit(50) 695 | # 696 | # def tc58_ershoufang_insert_seed(self, seed): 697 | # return self.db["TC58ErShouFangSeed"].insert_many(seed) 698 | # 699 | # def tc58_ershoufang_insert_info(self, info): 700 | # return self.db["TC58ErShouFangInfo"].insert_one({"url":info["url"], 701 | # "title" : info["title"], 702 | # "address": info["address"], 703 | # "totalPrice": info["totalPrice"], 704 | # 
"unitPrice": info["unitPrice"], 705 | # "area":info["area"], 706 | # "status":0}) 707 | # 708 | # 709 | # def tc58_ershoufang_find_seed(self): 710 | # return self.db["TC58ErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 711 | # 712 | # ####################租房 713 | # 714 | # def tc58_zufang_find_seed_from_xiaoqu(self): 715 | # return self.db["TC58XiaoQu"].find({"status": 0}, {"zuFangUrl": 1, "_id": 0}).limit(50) 716 | # 717 | # def tc58_zufang_find_seed(self): 718 | # return self.db["TC58ZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 719 | # 720 | # def tc58_zufang_insert_seed(self, seed): 721 | # return self.db["TC58ZuFangSeed"].insert_many(seed) 722 | # 723 | # def tc58_zufang_insert_info(self, info): 724 | # return self.db["TC58ZuFangInfo"].insert_one({"url":info["url"], 725 | # "title" : info["title"], 726 | # "address": info["address"], 727 | # "unitPrice": info["unitPrice"], 728 | # "status":0}) 729 | # ### 麦田小区信息 ### 730 | # 731 | # def maitian_xiaoqu_insert_seed(self, seed): 732 | # ''' 733 | # 生成小区名称及相应的url 734 | # ''' 735 | # print("insert操作======》麦田小区url") 736 | # return self.db["maitianXiaoQuSeed"].insert(seed) 737 | # 738 | # def maitian_xiaqu_url(self): 739 | # return self.db["maitianXiaoQuSeed"].find({}).limit(1400) 740 | # 741 | # def maitian_xiaoqu_insert(self, data): 742 | # return self.db["maitian_xiaoqu_info"].insert(data) 743 | # 744 | # 745 | # 746 | # ################################房天下########################################################## 747 | # #####房天下小区 748 | # def fang_xiaoqu_insert_seed(self, seed): 749 | # ''' 750 | # 小区列表页种子入库 751 | # :param seed: 752 | # :return: 753 | # ''' 754 | # log.msg("[{0} line:{1}] insert FangXiaoQuSeed=====>>>>>插入Fang天下小区url".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 755 | # return self.db["FangXiaoQuSeed"].insert(seed) 756 | # 757 | # def fang_xiaoqu_find_seed1(self): 758 | # return self.db["FangXiaoQuSeed"].find({"status": 0}).limit(50) 759 | # 760 | # def fang_xiaoqu_find_seed2(self): 761 | # return self.db["FangXiaoQuInfo"].find({"status": 0},{"xiaoqu_url": 1, "_id": 0}).limit(50) 762 | # 763 | # def fang_xiaoqu_update_seed(self, info): 764 | # log.msg("[{0} line:{1}] update FangXiaoQuSeed=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 765 | # self.db["FangXiaoQuSeed"].update({"url": info['url']}, 766 | # {"$set": {"status": info['status'], 767 | # "ts": info["ts"] 768 | # }},True) 769 | # 770 | # def fang_xiaoqu_update_info(self, info): 771 | # log.msg("[{0} line:{1}] update FangXiaoQuInfo=====>>>>>Fang天下小区详细页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 772 | # self.db["FangXiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']}, 773 | # {"$set": {"xiaoqu_name_two": info['xiaoqu_name_two'], 774 | # "sale_url": info["sale_url"], 775 | # "chengjiao_url":info["chengjiao_url"], 776 | # "rent_url":info["rent_url"], 777 | # "sale_total": info["sale_total"], 778 | # "chengjiao_total": info["chengjiao_total"], 779 | # "rent_total": info["rent_total"], 780 | # "xiaoqu_id": info["xiaoqu_id"], 781 | # "region_two": info["region_two"], 782 | # "region_three": info["region_three"], 783 | # "longitude": info["longitude"], 784 | # "latitude": info["latitude"], 785 | # "trend": info["trend"], 786 | # "unit_price": info["unit_price"], 787 | # "huan_bi": info["huan_bi"], 788 | # "tong_bi": info["tong_bi"], 789 | # "address": info["address"], 790 | # "property_type": 
info["property_type"], 791 | # "build_year": info["build_year"], 792 | # "developer": info["developer"], 793 | # "build_type": info["build_type"], 794 | # "building_area": info["building_area"], 795 | # "building_foot_print": info["building_foot_print"], 796 | # "property_company": info["property_company"], 797 | # "greening_rate": info["greening_rate"], 798 | # "far": info["far"], 799 | # "property_cost":info["property_cost"], 800 | # "follow":info["follow"], 801 | # "imgs":info["imgs"], 802 | # "layout_imgs": info["layout_imgs"], 803 | # "status": info["status"], 804 | # "ts": info["ts"] 805 | # }},True) 806 | # ####房天下二手房 807 | # def fang_ershoufang_insert_seed(self, seed): 808 | # ''' 809 | # 房天下二手房列表页种子入库 810 | # :param seed: 811 | # :return: 812 | # ''' 813 | # log.msg("[{0} line:{1}] insert FangErShouFangSeed=====>>>>>房天下二手房url".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 814 | # return self.db["FangErShouFangSeed"].insert(seed) 815 | # 816 | # def fang_ershoufang_find_seed(self): 817 | # return self.db["FangXiaoQuInfo"].find({"$and":[{"sale_url":{"$ne":None}},{"sale_url":""},{"status": 1}]},{"xiaoqu_url": 1, "_id": 0}) 818 | # 819 | # def fang_ershoufang_info(self): 820 | # return self.db["FangErShouFangInfo"].find({"status": 1,"zf_jjname": "业主"},{"formUrl": 1, "_id": 0}) 821 | # 822 | # def fang_ershoufang_info_update(self, info): 823 | # log.msg("update操作======》Fang天下二手房小区名称", level=log.INFO) 824 | # self.db["FangErShouFangInfo"].update_many({"formUrl": info['formUrl']}, 825 | # {"$set": {"xiaoquName": info['xiaoquName'], 826 | # "status":info["status"] 827 | # }}, True) 828 | # 829 | # def fang_ershoufang_find_seed1(self): 830 | # return self.db["FangErShouFangSeed"].find({"status": 0}) 831 | # 832 | # def fang_ershoufang_find_seed2(self): 833 | # return self.db["FangErShouFangInfo"].find({"status": 0},{"ershoufang_url": 1, "_id": 0}) 834 | # 835 | # def fang_ershoufang_find_seed3(self): 836 | # return self.db["FangXiaoQuInfo"].find({"$and":[{"sale_url":{"$ne":None}},{"sale_url":{"$ne":""}},{"status": 1}]},{"sale_url": 1, "_id": 0}) 837 | # 838 | # def fang_ershoufang_update_seed(self, info): 839 | # log.msg("update操作======》Fang天下二手房列表页种子更新{0}".format(info['url']),level=log.INFO) 840 | # self.db["FangErShouFangSeed"].update({"url": info['url']}, 841 | # {"$set": {"status": info['status'], 842 | # "ts": info["ts"] 843 | # }},True,True) 844 | # def fang_xiaoqu_ershoufang_update_seed(self, info): 845 | # log.msg("[{0} line:{1}] update FangXiaoQuInfo=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 846 | # self.db["FangXiaoQuInfo"].update({"sale_url": info['sale_url']}, 847 | # {"$set": {"status": info['status'], 848 | # "ts": info["ts"] 849 | # }},True) 850 | # 851 | # def fang_xiaoqu_ershoufang_update_seed2(self, info): 852 | # log.msg("[{0} line:{1}] update FangErShouFangSeed=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 853 | # self.db["FangXiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']}, 854 | # {"$set": {"status": info['status'], 855 | # "ts": info["ts"] 856 | # }},True) 857 | # 858 | # def fang_ershoufang_update_info(self, info): 859 | # log.msg("[{0} line:{1}] update FangErShouFangInfo=====>>>>>Fang天下二手房详细页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 860 | # self.db["FangErShouFangInfo"].update({"ershoufang_url": info['ershoufang_url']}, 861 | # {"$set": {"total_price": info['total_price'], 862 | 
# "decoration": info["decoration"], 863 | # "floor":info["floor"], 864 | # "layout":info["layout"], 865 | # "build_area": info["build_area"], 866 | # "unit_price": info["unit_price"], 867 | # "orientation": info["orientation"], 868 | # "region_three": info["region_three"], 869 | # "build_year": info["build_year"], 870 | # "has_elevator": info["has_elevator"], 871 | # "house_use": info["house_use"], 872 | # "build_structure": info["build_structure"], 873 | # "build_type": info["build_type"], 874 | # "publish_date": info["publish_date"], 875 | # "buy_point": info["buy_point"], 876 | # "fzzj": info["fzzj"], 877 | # "xiaoqu_instru": info["xiaoqu_instru"], 878 | # "ye_zhu": info["ye_zhu"], 879 | # "status":info["status"], 880 | # "ts": info["ts"] 881 | # }},True) 882 | # 883 | # 884 | # ######################################中原房产################################ 885 | # ###小区 886 | # def zhongyuan_xiaoqu_insert_seed(self, seed): 887 | # ''' 888 | # 小区列表页种子入库 889 | # :param seed: 890 | # :return: 891 | # ''' 892 | # log.msg("[{0} line:{1}] insert ZhongYuanXiaoQuSeed =====>>>>> 中原房产小区列表页种子入库".format(self.__class__.__name__,sys._getframe().f_lineno),level=log.INFO) 893 | # return self.db["ZhongYuanXiaoQuSeed"].insert(seed) 894 | # 895 | # def zhongyuan_xiaoqu_update_seed(self, seed): 896 | # ''' 897 | # 更新小区列表页种子状态 898 | # :param seed: 899 | # :return: 900 | # ''' 901 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuSeed =====>>>>> 更新中原房产列表页种子".format(self.__class__.__name__,sys._getframe().f_lineno),level=log.INFO) 902 | # return self.db["ZhongYuanXiaoQuSeed"].update_one({"url":seed["url"]}, 903 | # {"$set":{"status":seed["status"], 904 | # "ts":seed["ts"]}}) 905 | # 906 | # def zhongyuan_xiaoqu_update_info(self, info): 907 | # ''' 908 | # 更新中原详细页种子状态和详细信息 909 | # :param info: 910 | # :return: 911 | # ''' 912 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuInfo=====>>>>>更新家中原地产详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 913 | # return self.db["ZhongYuanXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]}, 914 | # {"$set":{"xiaoqu_name_other": info["xiaoqu_name_other"], 915 | # "address": info['address'], 916 | # "region_three": info['region_three'], 917 | # "region_four" : info["region_four"], 918 | # "property_type": info["property_type"], 919 | # "build_year": info["build_year"], 920 | # "property_cost" : info["property_cost"], 921 | # "property_company": info["property_company"], 922 | # "developer": info["developer"], 923 | # "far": info["far"], 924 | # "greening_rate": info["greening_rate"], 925 | # "sale_url":info["sale_url"], 926 | # "rent_url":info["rent_url"], 927 | # "chengjiao_url":info["chengjiao_url"], 928 | # "latitude":info["latitude"], 929 | # "longitude": info["longitude"], 930 | # "ts":info["ts"], 931 | # "status":info["status"], 932 | # "imgs": info["imgs"], 933 | # "trend": info["trend"], 934 | # "html": info["html"], 935 | # "xiaoqu_id":info["xiaoqu_id"] 936 | # }},True) 937 | # 938 | # def zhongyuan_xiaoqu_find_seed1(self): 939 | # ''' 940 | # 中原地产小区列表页种子提取 941 | # :return: 942 | # ''' 943 | # print("finid操作======》查询中原地产小区列表页种子") 944 | # return self.db["ZhongYuanXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0}) 945 | # 946 | # def zhongyuan_xiaoqu_find_seed2(self): 947 | # ''' 948 | # 中原小区详细信息种子提取 949 | # :return: 950 | # ''' 951 | # print("finid操作======》查询中原地产详细信息页种子") 952 | # return self.db["ZhongYuanXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0}) 953 | # 954 | # def 
zhongyuan_chengjiaofang_find_seed(self): 955 | # ''' 956 | # 中原小区交易种子提取 957 | # :return: 958 | # ''' 959 | # return self.db["ZhongYuanXiaoQuInfo"].find({"status": 1,"chengjiao_url":{"$ne":""}}, {"chengjiao_url": 1, "_id": 0}) 960 | # 961 | # def zhongyuan_xiaoqu_update_chengjiao_seed(self, info): 962 | # ''' 963 | # 中原小区 964 | # :param info: 965 | # :return: 966 | # ''' 967 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuInfo=====>>>>>更新链家小区成交房种子状态 status=2".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO) 968 | # self.db["ZhongYuanXiaoQuInfo"].update({"chengjiao_url": info['chengjiao_url']}, 969 | # {"$set":{"status": info["status"], 970 | # "ts":info["ts"] 971 | # }} ,True) 972 | # 973 | # 974 | # ###############################赶集网############################################### 975 | # def ganji_ershoufang_insert_seed(self, seed): 976 | # return self.db["GanJiErShouFangSeed"].insert_many(seed) 977 | # 978 | # def ganji_ershoufang_find_seed(self): 979 | # return self.db["GanJiErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 980 | # 981 | # def ganji_ershoufang_insert_info(self, info): 982 | # return self.db["GanJiErShouFangInfo"].insert_one({"url":info["url"], 983 | # "title" : info["title"], 984 | # "layout": info["layout"], 985 | # "area": info["area"], 986 | # "chaoXiang": info["chaoXiang"], 987 | # "floor": info["floor"], 988 | # "decoration": info["decoration"], 989 | # "xiaoquName": info["xiaoquName"], 990 | # "xiaoquUrl": info["xiaoquUrl"], 991 | # "address": info["address"], 992 | # "unitPrice": info["unitPrice"], 993 | # "totalPrice": info["totalPrice"], 994 | # "status":0}) 995 | # 996 | # def ganji_zufang_insert_seed(self, seed): 997 | # return self.db["GanJiZuFangSeed"].insert_many(seed) 998 | # 999 | # def ganji_zufang_find_seed(self): 1000 | # return self.db["GanJiZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50) 1001 | # 1002 | # def ganji_zufang_insert_info(self, info): 1003 | # return self.db["GanJiZuFangInfo"].insert_one({"url":info["url"], 1004 | # "title" : info["title"], 1005 | # "layout": info["layout"], 1006 | # "area": info["area"], 1007 | # "chaoXiang": info["chaoXiang"], 1008 | # "floor": info["floor"], 1009 | # "decoration": info["decoration"], 1010 | # "xiaoquName": info["xiaoquName"], 1011 | # "xiaoquUrl": info["xiaoquUrl"], 1012 | # "address": info["address"], 1013 | # "unitPrice": info["unitPrice"], 1014 | # "leasehold":info["leasehold"], 1015 | # "publishDate":info["publishDate"], 1016 | # "status":0}) 1017 | 1018 | ##########img################################################ 1019 | class ImgDownloadPipeline(ImagesPipeline): 1020 | #yeild 调用下载 1021 | def get_media_requests(self, item, info): 1022 | for image_url in item['image_urls']: 1023 | yield scrapy.Request(image_url) 1024 | 1025 | def item_completed(self, results, item, info): 1026 | image_paths = [x['path'] for ok, x in results if ok] 1027 | if not image_paths: 1028 | raise DropItem("Item contains no images") 1029 | item['image_paths'] = image_paths 1030 | return item 1031 | 1032 | #########redis############################################# 1033 | class RedisPipeline(object): 1034 | 1035 | def __init__(self): 1036 | if not hasattr(RedisPipeline, 'pool'): 1037 | RedisPipeline.create_pool() 1038 | self._connection = redis.Redis(connection_pool=RedisPipeline.pool) 1039 | 1040 | @staticmethod 1041 | def create_pool(): 1042 | RedisPipeline.pool = redis.ConnectionPool( 1043 | host="127.0.0.1", 1044 | port=6379, 1045 | db=0) 1046 | 1047 | def 
set_lianjia_seed(self, key, value): 1048 | '''''set data with (key, value) 1049 | ''' 1050 | return self._connection.lpush(key, value) 1051 | 1052 | def set_seed(self, key, value): 1053 | '''''set data with (key, value) 1054 | ''' 1055 | return self._connection.lpush(key, value) 1056 | 1057 | def list_len(self,key): 1058 | ''' 1059 | 获取长度 1060 | :return: 1061 | ''' 1062 | return self._connection.llen(key) 1063 | 1064 | def delete_key(self,key): 1065 | return self._connection.delete(key) 1066 | 1067 | 1068 | 1069 | ####mysql###################################################### 1070 | class MysqlPipline(object): 1071 | # 定义静态变量实例 1072 | __instance = None 1073 | 1074 | def __init__(self): 1075 | pass 1076 | 1077 | def __new__(cls, *args, **kwargs): 1078 | if not cls.__instance: 1079 | try: 1080 | Lock.acquire() 1081 | # double check 1082 | if not cls.__instance: 1083 | cls.conn = pymysql.connect(host=settings['MYSQL_HOST'], 1084 | port=settings['MYSQL_PORT'], 1085 | user=settings['MYSQL_USER'], 1086 | passwd=settings['MYSQL_PASSWD'], 1087 | db=settings['MYSQL_DB']) 1088 | cls.cursor = cls.conn.cursor() 1089 | cls.__instance = super().__new__(cls, *args, **kwargs) 1090 | finally: 1091 | Lock.release() 1092 | return cls.__instance 1093 | 1094 | # 使用twisted将mysql插入变成异步执行 1095 | def process_item(self, item, spider): 1096 | pass 1097 | 1098 | def close(self): 1099 | self.cursor.close() 1100 | self.conn.close() 1101 | 1102 | def handle_error(self, failure, item, spider): 1103 | #处理异步插入的异常 1104 | print (failure) 1105 | 1106 | def excute_sql(self,sql): 1107 | try: 1108 | logger.info(f"excute_sql===>>> {sql}") 1109 | self.cursor.execute(sql) 1110 | self.conn.commit() 1111 | except Exception as e: 1112 | if "Duplicate" not in str(e): 1113 | self.conn.rollback() 1114 | 1115 | 1116 | # excel 1117 | # class ExcelPipeline(object): 1118 | # def __init__(self): 1119 | # self.wb = Workbook() 1120 | # self.ws = self.wb.active 1121 | # self.ws.append(['文章url', '文章title', '文章发布时间', '文章内容', '文章作者连接', '文章作者', '文章评论数量']) # 设置表头 1122 | # 1123 | # self.wb2 = Workbook() 1124 | # self.ws2 = self.wb2.active 1125 | # self.ws2.append(['文章url', '评论人', '评论时间', '评论内容', '评论给那一条', '评论给谁']) # 设置表头 1126 | # 1127 | # def process_item(self, item, spider): 1128 | # collection_name = item.__class__.__name__ 1129 | # if collection_name == "DouBanItem": 1130 | # line = [item['article_url'], item['article_title'], item['article_publish_date'], item['article_content'] 1131 | # , item['article_author_url'], item['article_author_name'], 1132 | # item['article_comment_quantity']] # 把数据中每一项整理出来 1133 | # self.ws.append(line) # 将数据以行的形式添加到xlsx中 1134 | # self.wb.save('content.xlsx') # 保存xlsx文件 1135 | # return item 1136 | # if collection_name == "CommentItem": 1137 | # line = [item['article_url'], item['comment_people'], item['comment_time'], item['comment_content'] 1138 | # , item['comment_to_which_coment'], item['comment_to_Who']] # 把数据中每一项整理出来 1139 | # self.ws2.append(line) # 将数据以行的形式添加到xlsx中 1140 | # self.wb2.save('comment.xlsx') # 保存xlsx文件 1141 | # return item 1142 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dz_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dz_spider' 13 | 14 | SPIDER_MODULES = ['dz_spider.spiders'] 15 | NEWSPIDER_MODULE = 'dz_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dz_spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 2 31 | # 配合使用 32 | DOWNLOAD_TIMEOUT = 30 33 | 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | # DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | # } 49 | 50 | # Enable or disable spider middlewares 51 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'dz_spider.middlewares.DzSpiderSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'dz_spider.middlewares.RotateUserAgentMiddleware':400, 60 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware':500 61 | } 62 | 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | #数据保存 72 | ITEM_PIPELINES = { 73 | #'house.pipelines.ImgDownloadPipeline': 100, 74 | # 'dz_spider.pipelines.MongoPipeline':2, 75 | 'dz_spider.pipelines.MysqlPipline':3, 76 | #'house.pipelines.ExcelPipeline':1, 77 | } 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | #AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | #AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | #AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of requests Scrapy should be sending in parallel to 87 | # each remote server 88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | #AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | #HTTPCACHE_ENABLED = True 95 | #HTTPCACHE_EXPIRATION_SECS = 0 96 | #HTTPCACHE_DIR = 'httpcache' 97 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | #HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | 100 | 101 | # MongoDB数据连接 102 | MONGO_URI="mongodb://127.0.0.1:27017/" 103 | MONGO_DATABASE="house" 104 | # mysql数据库连接 105 | MYSQL_HOST="127.0.0.1" 106 | MYSQL_PORT=3306 107 | MYSQL_DB="yuqing_db" 108 | MYSQL_USER="root" 109 | MYSQL_PASSWD="lang1994" 110 | 111 | 112 | LOG_LEVEL = "DEBUG" 113 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/__pycache__/baidu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/baidu.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/__pycache__/sogou.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/sogou.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/__pycache__/toutiao.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/toutiao.cpython-36.pyc -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import json 4 | import scrapy 5 | import requests 6 | import urllib.parse 7 | from dz_spider.pipelines import MysqlPipline 8 | from scrapy.log import logger 9 | 10 | 11 | class BaiduSpider(scrapy.Spider): 12 | name = 'baidu' 13 | allowed_domains = ['www.baidu.com'] 14 | start_urls = ['https://www.baidu.com/s?wd=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&oq=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&ie=utf-8&rsv_pq=f23a79aa000d332f&rsv_t=7059xeWb4ls1KKoJ0h16REkV2j9830xUMMrCpERps%2BBRpST5YFJuXbPeYuo'] 15 | mysql = MysqlPipline() 16 | 17 | 18 | def start_requests(self): 19 | for url in self.start_urls: 20 | for page in range(9): 21 | yield scrapy.Request(url=f"{url}&pn={page}0") 22 | 23 | def parse(self, response): 24 | t1=time.time() 25 | html=scrapy.Selector(text=response.text) 26 | divs=html.css("#content_left > div .f13 .c-tools::attr(data-tools)") 27 | for div in divs: 28 | 
data_str=div.extract() 29 | data_dict=json.loads(data_str) 30 | url=None 31 | try: 32 | url=requests.get(data_dict['url'],timeout=5).url 33 | schame = urllib.parse.urlparse(url).netloc 34 | sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{schame}',1)" 35 | self.mysql.excute_sql(sql) 36 | except Exception as e: 37 | logger.error(f"requests.get(data_dict['url']).url ===>>> {str(e)}") 38 | t2=time.time() 39 | logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}") -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/sogou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import json 4 | import scrapy 5 | import requests 6 | import urllib.parse 7 | from dz_spider.pipelines import MysqlPipline 8 | from scrapy.log import logger 9 | 10 | 11 | class SogouSpider(scrapy.Spider): 12 | name = 'sogou' 13 | allowed_domains = ['www.sogou.com'] 14 | start_urls = ['https://www.sogou.com/tx?query=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&hdq=sogou-wsse-3f7bcd0b3ea82268&duppid=1&cid=&s_from=result_up&sut=1606&sst0=1565226159021&lkt=0,0,0&sugsuv=006E51C00177C3995CB434B8FE950363&sugtime=1565226159021&ie=utf8&w=01029901&dr=1'] 15 | mysql = MysqlPipline() 16 | 17 | 18 | def start_requests(self): 19 | for url in self.start_urls: 20 | for page in range(0,3): 21 | yield scrapy.Request(url=f"{url}&page={page}") 22 | 23 | def parse(self, response): 24 | t1=time.time() 25 | html=scrapy.Selector(text=response.text) 26 | divs=html.css("div.results > div") 27 | for div in divs: 28 | vrwrap=div.css("div.vrwrap") 29 | if len(vrwrap)==0: 30 | title = "".join(div.css("div.rb h3 a::text").extract()) 31 | url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0] 32 | else: 33 | title="".join(div.css("div.vrwrap h3 a::text").extract()) 34 | url = "https://www.sogou.com"+div.css("div.vrwrap h3 a::attr(href)").extract()[0] 35 | try: 36 | _html=scrapy.Selector(text=requests.get(url,verify=False).text) 37 | url = _html.re("window.location.replace\(\"(.*?)\"\)")[0] 38 | schame = urllib.parse.urlparse(url).netloc 39 | sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{schame}',1)" 40 | self.mysql.excute_sql(sql) 41 | except Exception as e: 42 | logger.error(f"requests.get(url,verify=False) ===>>> {str(e)}") 43 | t2=time.time() 44 | logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}") 45 | -------------------------------------------------------------------------------- /dz_spider/dz_spider/spiders/toutiao.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import json 4 | import scrapy 5 | import urllib.parse 6 | from dz_spider.pipelines import MysqlPipline 7 | from scrapy.log import logger 8 | from selenium import webdriver 9 | 10 | 11 | class ToutiaoSpider(scrapy.Spider): 12 | """ 13 | 烦人的cookie 直接用driver 14 | """ 15 | 16 | name = 'toutiao' 17 | allowed_domains = ['www.toutiao.com'] 18 | start_urls = ['https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&format=json&keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis'] 19 | mysql = MysqlPipline() 20 | 21 | 22 | def start_requests(self): 23 | 
driver=webdriver.Chrome(executable_path="/Users/yuanlang/work/javascript/chromedriver") 24 | driver.get("https://www.toutiao.com/search/?keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6") 25 | time.sleep(2) 26 | for url in self.start_urls: 27 | for page in range(0,8): 28 | driver.get(url=f"{url}&offset={20*page}&timestamp={'%d'%(time.time()*1000)}") 29 | time.sleep(5) 30 | html=scrapy.Selector(text=driver.page_source) 31 | content=html.css("body > pre::text").extract_first() 32 | data=json.loads(content)["data"] 33 | for item in data: 34 | try: 35 | if "article_url" not in item: 36 | if "display" not in item: 37 | print(item) 38 | continue 39 | print(item["display"]) 40 | _url = item["display"]["info"]["url"] 41 | title = item["display"]["emphasized"]["title"] 42 | else: 43 | title = item["abstract"] 44 | _url = item["article_url"] 45 | schame = urllib.parse.urlparse(_url).netloc 46 | sql = f"insert into seed(url,title,site_name,type) values('{_url}','{title}','{schame}',1)" 47 | self.mysql.excute_sql(sql) 48 | except Exception as e: 49 | logger.error(f"parse toutiao item ===>>> {str(e)}") 50 | 51 | # time.sleep(6000) 52 | 53 | def parse(self, response): 54 | pass 55 | 56 | -------------------------------------------------------------------------------- /dz_spider/log/app.log: -------------------------------------------------------------------------------- 1 | 2019-07-26 19:53:26,076 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net') 2 | 2019-07-26 19:53:28,387 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com') 3 | 2019-07-26 19:53:29,603 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://ah.people.com.cn/n2/2018/1008/c358314-32129820.html','四川达州地面塌陷 造成至少一人死亡--安徽频道--人民网 ','ah.people.com.cn') 4 | 2019-07-26 19:53:31,895 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://wemedia.ifeng.com/81218105/wemedia.shtml','最新!达州路面塌陷已发现两名被困者,其中一人抢救无效死亡','wemedia.ifeng.com') 5 | 2019-07-26 19:53:33,591 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://blog.sina.com.cn/s/blog_5f7396520102z29j.html','达州路面塌陷事件:要追究刑事责任人_丁金坤_新浪博客','blog.sina.com.cn') 6 | 2019-07-26 19:53:35,960 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://cnahrx.com/a2188-320779-0.shtml','[达川区]达州南外一人行路面发生塌陷 官方发布险情通报_达州今日...','cnahrx.com') 7 | 2019-07-26 19:53:36,888 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.sina.com.cn/s/2018-10-14/doc-ihmhafir5676446.shtml','四川达州涵洞塌陷区域回填结束 事件原因正在调查|涵洞|..._新浪新闻','news.sina.com.cn') 8 | 2019-07-26 19:53:38,456 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/ggxw/201810/56571800.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现_..._四川在线','sichuan.scol.com.cn') 9 | 2019-07-26 19:53:39,057 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) 
values('http://society.huanqiu.com/photonew/2018-10/2905788.html','四川达州一人行道突然塌陷 有人员掉落_社会_环球网','society.huanqiu.com') 10 | 2019-07-26 19:53:39,644 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cngold.org/gundong/2018-10-09/c6009808.html','人行道路面塌陷 事故疑似造成4名行人陷落-滚动新闻-金投热点网-金...','news.cngold.org') 11 | 2019-07-26 19:54:55,739 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sc.gov.cn/10462/10464/13722/2018/10/8/10460284.shtml','达州南外一人行路面发生塌陷 官方发布险情通报- 四川省人民政府','www.sc.gov.cn') 12 | 2019-07-26 19:58:53,120 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net') 13 | 2019-07-26 19:58:55,254 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://cnahrx.com/a2188-320779-0.shtml','[达川区]达州南外一人行路面发生塌陷 官方发布险情通报_达州今日...','cnahrx.com') 14 | 2019-07-26 19:58:55,706 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://k.sina.com.cn/article_1855024094_6e916bde02000dz07.html?cre=tianyi&mod=pcpager_news&loc=17&r=9&doct=0&rfunc=100&tj=none&tr=9','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中','k.sina.com.cn') 15 | 2019-07-26 19:58:56,462 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/ggxw/201810/56571800.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现_..._四川在线','sichuan.scol.com.cn') 16 | 2019-07-26 19:58:57,918 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/AF7ACB92C3T658683.shtml','人行道路面塌陷 多人死亡令人惋惜_医生在线','news.51daifu.com') 17 | 2019-07-26 19:58:58,453 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cctv.com/2018/10/09/ARTI32aZet0kONxAyM8i1jIZ181009.shtml','达州塌陷事故中的遇难夫妻 4天前刚举行婚礼_新闻频道_央视网(cctv...','news.cctv.com') 18 | 2019-07-26 19:58:59,616 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.520zg.net/forum.php?mod=viewthread&tid=1239730','人行道突然塌陷,90后新婚夫妻遇难!出事前刚买完喜糖 - 龙都茶坊 -...','bbs.520zg.net') 19 | 2019-07-26 19:59:00,147 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.xishu365.com/thread-484836-1-1.html','四川达州一人行道路面塌陷,四名大人一名小孩掉入坑内,情况有点不...','bbs.xishu365.com') 20 | 2019-07-26 19:59:00,751 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.thepaper.cn/newsDetail_forward_2508912','父子俩仍被埋!达州路面塌陷疑似4人陷落,已致2人死亡','m.thepaper.cn') 21 | 2019-07-26 19:59:01,413 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com') 22 | 2019-07-26 20:00:17,396 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sc.gov.cn/10462/10464/13722/2018/10/8/10460284.shtml','达州南外一人行路面发生塌陷 官方发布险情通报- 四川省人民政府','www.sc.gov.cn') 23 | 2019-07-26 20:00:19,674 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) 
values('http://www.dz19.net/thread-1817853-1-1.html','达州南外地陷事故救援结束!被困父子已被寻获 - 今日达..._凤凰山下','www.dz19.net') 24 | 2019-07-26 20:00:20,662 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181007/000912644.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川..._四川新闻网','scnews.newssc.org') 25 | 2019-07-26 20:00:21,400 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258128561_100144890','达州南外一人行路面塌陷 临时交通管制险情通报_达川区','www.sohu.com') 26 | 2019-07-26 20:00:22,194 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3EOHRD05149V0C.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在..._网易订阅','dy.163.com') 27 | 2019-07-26 20:00:27,258 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258128561_100144890','达州南外一人行路面塌陷 临时交通管制险情通报-警法频道-手机搜狐','m.sohu.com') 28 | 2019-07-26 20:00:28,516 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181014/000914485.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍..._四川新闻网','scnews.newssc.org') 29 | 2019-07-26 20:00:30,309 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.ifeng.com/a/20181015/6945534_0.shtml','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中_...','sc.ifeng.com') 30 | 2019-07-26 20:00:32,353 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.chinanews.com/sh/shipin/cns-d/2018/10-07/news788198.shtml','四川达州市达川区一人行道突然塌陷-中新网视频','www.chinanews.com') 31 | 2019-07-26 20:00:35,831 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dsj365.cn/front/article/17733.html','大事记盘点:2018年10月国内热点事件 - 大事记','www.dsj365.cn') 32 | 2019-07-26 20:00:37,645 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.520zg.net/forum.php?mod=viewthread&tid=1239730','人行道突然塌陷,90后新婚夫妻遇难!出事前刚买完喜糖 - 龙都茶坊 -...','bbs.520zg.net') 33 | 2019-07-26 20:00:40,940 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.gazx.org/content/2018-10/8/201810815122663654.htm','达州“地陷”是天灾还是人祸?_广安在线','www.gazx.org') 34 | 2019-07-26 20:00:42,099 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.jinciwei.cn/j447103.html','最新动态!南外人行道塌陷恢复治理工作正在有序进行,赶紧看过来! 
-...','www.jinciwei.cn') 35 | 2019-07-26 20:00:43,186 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.nbtv.cn/xwdsg/gn/30067333.shtml','达州人行道路面塌陷事故搜救结束,4名被困者遇难','www.nbtv.cn') 36 | 2019-07-26 20:00:44,217 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.hainan.net/post-73-653820-1.shtml','这才是真正的坑人,坑死人了,人行道塌陷致一对新婚夫妻遇难_三亚_...','bbs.hainan.net') 37 | 2019-07-26 20:00:45,580 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dzwww.com/xinwen/shehuixinwen/201810/t20181007_17916739.htm','四川达州一人行路面塌陷 附近路段实施临时交通管制_社会新闻_大众网','www.dzwww.com') 38 | 2019-07-26 20:00:46,469 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.kandianla.com/kandian/6322.html','人行道路面塌陷 怎么回事 真是太危险了_看点啦','www.kandianla.com') 39 | 2019-07-26 20:00:49,371 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://zhaopin.baidu.com/m/company?query=4d37003a46a1bac4f0593d376d803df3','达州济民医院-企业名片-百度百聘','zhaopin.baidu.com') 40 | 2019-07-26 20:00:50,778 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.11467.com/qiye/39876980.htm','达州济民医院','www.11467.com') 41 | 2019-07-26 20:00:51,450 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.bjnews.com.cn/wevideo/2018/10/08/509511.html','四川达州路面塌陷救援继续 附近居民暂停水电气供应 - ..._新京报网','www.bjnews.com.cn') 42 | 2019-07-26 20:00:52,201 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181014164416587.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中_...','mini.eastday.com') 43 | 2019-07-26 20:00:54,105 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTHPCVH205149V0C.html','达州南外一人行路面发生塌陷 官方发布险情通报_网易订阅','dy.163.com') 44 | 2019-07-26 20:00:54,620 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181007195326849.html','达州南外一人行路面发生塌陷 官方发布险情通报_社会频道_东方头条','mini.eastday.com') 45 | 2019-07-26 20:00:56,524 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.sina.com.cn/c/2018-10-11/doc-ihmhafiq9423734.shtml','四川达州塌陷事故专家组:尚未完全确定地陷原因|地陷|涵..._新浪新闻','news.sina.com.cn') 46 | 2019-07-26 20:00:57,808 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.sina.com.cn/o/2018-10-08/doc-ihkvrhpt0250131.shtml','四川达州一人行道路面塌陷 两被困者抢救无效死亡|路面..._新浪新闻','news.sina.com.cn') 47 | 2019-07-26 20:00:58,655 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://gas.newssc.org/system/20181008/002523077.html','达州南外一人行路面发生塌陷 官方发布险情通报 - 四川新闻网广安...','gas.newssc.org') 48 | 2019-07-26 20:00:59,622 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dingjinkun.blog.caixin.com/archives/189901','达州路面塌陷事件:要追究刑事责任人-丁金坤-财新博客-新世纪的...','dingjinkun.blog.caixin.com') 49 | 2019-07-26 20:01:00,602 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) 
values('http://www.chinanews.com/gn/2018/10-08/8644238.shtml','四川达州地面塌陷 造成至少一人死亡-中新网','www.chinanews.com') 50 | 2019-07-26 20:01:02,375 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html#from=relevant','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com') 51 | 2019-07-26 20:01:03,545 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!_..._搜狐','www.sohu.com') 52 | 2019-07-26 20:01:04,608 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.qichacha.com/postnews_19fb84f3c7d4c4036adca34dec5d6107.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川新闻-四川...','www.qichacha.com') 53 | 2019-07-26 20:01:06,573 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.chinanews.com/gn/2018/10-08/8644238.shtml','四川达州地面塌陷 造成至少一人死亡-中新网','www.chinanews.com') 54 | 2019-07-26 20:01:07,058 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dingjinkun.blog.caixin.com/archives/189901','达州路面塌陷事件:要追究刑事责任人-丁金坤-财新博客-新世纪的...','dingjinkun.blog.caixin.com') 55 | 2019-07-26 20:01:07,982 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258355254_100172646/','一周安全警示:四川塌陷事故2人失踪,定陶塔吊倒塌3人死亡_手机搜狐网','m.sohu.com') 56 | 2019-07-26 20:01:09,502 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/dzxw/201810/56572825.html','达州地面塌陷吞噬路人后续:隐患排查 周边约200户居民被..._四川在线','sichuan.scol.com.cn') 57 | 2019-07-26 20:01:10,238 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.sina.com.cn/news/m/2018-10-15/detail-ifxeuwws4249029.shtml','达州人行道路面塌陷事件:涵洞塌陷区域回填已结束_新浪四川_新浪网','sc.sina.com.cn') 58 | 2019-07-26 20:01:11,676 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613672926035544522&wfr=spider&for=pc','四川一人行道路突然塌陷,4名路人瞬间陷落,救援现场二次塌陷','baijiahao.baidu.com') 59 | 2019-07-26 20:01:12,642 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181009/000913005.html','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍..._四川新闻网','scnews.newssc.org') 60 | 2019-07-26 20:01:13,200 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258432536_99955512','突然陷落! 
四川人行道路面塌陷, 共4人遇难, 新婚夫妻和一对父子_...','www.sohu.com') 61 | 2019-07-26 20:01:14,671 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/dzxw/201810/56572827.html','达州地面塌陷吞噬路人后续:事发现场周边交通管制范围扩..._四川在线','sichuan.scol.com.cn') 62 | 2019-07-26 20:01:15,362 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258355254_100172646/','一周安全警示:四川塌陷事故2人失踪,定陶塔吊倒塌3人死亡_手机搜狐网','m.sohu.com') 63 | 2019-07-26 20:01:17,234 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258664216_151037','重磅!2018全国百强县区出炉!我省7个县区上榜,排名最靠前的是..._...','www.sohu.com') 64 | 2019-07-26 20:01:18,552 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817460-1-1.html','南外济民医院门口突现“天坑”消防,武警正在搜救::: - ..._凤凰山下','www.dz19.net') 65 | 2019-07-26 20:01:21,682 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://xin.baidu.com/yuqing?yuqingId=dd5f81d78d803062ef973ba7833ed2cb&fl=1&castk=LTE%3D','【达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中-...','xin.baidu.com') 66 | 2019-07-26 20:01:23,974 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181007/20181007A1HWGS00','【附官方险情通报】达州一人行道路面突然塌陷! 4名路人瞬间坠入,1...','new.qq.com') 67 | 2019-07-26 20:01:24,643 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/259434578_330146','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中_搜..._搜狐','www.sohu.com') 68 | 2019-07-26 20:01:25,409 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz169.net/2018/1008/102442.shtml','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_达州网','www.dz169.net') 69 | 2019-07-26 20:01:26,571 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.gohoedu.com/guo1207227p1p1.html','达州一人行道路面突然塌陷!4名路人瞬间坠入,1名男孩已被救起 - ...','bbs.gohoedu.com') 70 | 2019-07-26 20:01:27,354 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.ybvv.com/thread-1452995-1-1.html','黑人!四川一人行道路面突然塌陷! 4名路人瞬间坠入,1名男..._零距离','bbs.ybvv.com') 71 | 2019-07-26 20:01:28,118 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://info.fire.hc360.com/2018/10/1509011101753.shtml','...事件原因正在调查分析中-达州涵洞,塌陷区-消防行业-hc360慧聪网 ','info.fire.hc360.com') 72 | 2019-07-26 20:01:29,279 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258066494_116237','突发!达州一人行道路面突然塌陷! 
4名路人瞬间坠入,1名男孩..._搜狐','m.sohu.com') 73 | 2019-07-26 20:01:31,328 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258153846_100137571','南城路面塌陷,救援人员争分夺秒不眠不休搜救:今日上午又一..._搜狐','www.sohu.com') 74 | 2019-07-26 20:01:32,375 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://wemedia.ifeng.com/81315202/wemedia.shtml','【突发】达州人行道塌陷已致两人死亡,现场救援仍在进行!','wemedia.ifeng.com') 75 | 2019-07-26 20:01:33,471 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://temp.163.com/special/ntes_404/','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_网易订阅','temp.163.com') 76 | 2019-07-26 20:01:34,302 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTJG4TNI0512EL5Q.html','达州塌陷事故最新进展:再次发现被困者 已送往医院救治_网易订阅','dy.163.com') 77 | 2019-07-26 20:01:34,949 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181007/000912640.html','达州南外一人行路面塌陷 附近路段实施临时交通管制 -..._四川新闻网','scnews.newssc.org') 78 | 2019-07-26 20:01:36,264 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.eastday.com/s/20181008/u1a14284702.html','...进展:又一名被困人员被发现-路面塌陷 被困人员 事故 救援 10月...','news.eastday.com') 79 | 2019-07-26 20:01:38,222 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181008/20181008A0QJ86.html','达州市达川区人行道塌陷 4名群众陷落2人被搜救','new.qq.com') 80 | 2019-07-26 20:01:39,310 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.cssqt.com/xw/gn/sp/297837.shtml','四川达州一人行道路面突然塌陷!4名路人瞬间坠入 1名男孩已被救起!...','www.cssqt.com') 81 | 2019-07-26 20:01:40,317 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com') 82 | 2019-07-26 20:01:41,098 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!-..._搜狐','m.sohu.com') 83 | 2019-07-26 20:01:43,128 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com') 84 | 2019-07-26 20:01:43,887 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!-..._搜狐','m.sohu.com') 85 | 2019-07-26 20:01:45,247 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/5BE3E0B7C11T658714.shtml','人行道路面塌陷 意外受伤需及时送医治疗_医生在线','news.51daifu.com') 86 | 2019-07-26 20:01:46,573 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU0GP7QN0522JH3J.html','揪心!造成4人死亡,地面塌陷谁之过?专家回应事故原因_网易订阅','dy.163.com') 87 | 2019-07-26 20:01:47,931 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) 
values('http://k.sina.com.cn/article_5710586189_15460a14d02000k2ml.html','达川区人行道塌陷恢复治理正有序推进|塌陷|人行道|涵洞_新浪网','k.sina.com.cn') 88 | 2019-07-26 20:01:49,097 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dzrbs.com/html/2018-10/14/content_331870.htm','达州人行道路面塌陷事件:涵洞塌陷区域已回填,东环南..._达州日报网','www.dzrbs.com') 89 | 2019-07-26 20:01:49,915 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.sina.com.cn/news/m/2018-10-09/detail-ihkvrhpt2040783.shtml','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍失..._新浪四川','sc.sina.com.cn') 90 | 2019-07-26 20:01:51,099 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.ii119.cn/news/201810/15/52515.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_消防新闻_资讯...','www.ii119.cn') 91 | 2019-07-26 20:01:52,228 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181008112443666.html','四川达州地面塌陷 造成至少一人死亡_社会频道_东方头条','mini.eastday.com') 92 | 2019-07-26 20:01:52,764 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3Q3I5A0512EL5Q.html','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中_网易订阅','dy.163.com') 93 | 2019-07-26 20:01:54,401 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613872793220182890&wfr=spider&for=pc','四川达州市达川区路面塌陷被困4人全部遇难','baijiahao.baidu.com') 94 | 2019-07-26 20:01:55,106 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://k.sina.com.cn/article_5044281310_12ca99fde02000m3d1.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中','k.sina.com.cn') 95 | 2019-07-26 20:01:56,105 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181014162451012.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_社会频道_东方...','mini.eastday.com') 96 | 2019-07-26 20:01:57,080 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTO78BUC05371OH7.html','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_网易订阅','dy.163.com') 97 | 2019-07-26 20:01:58,787 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.dahe.cn/2018/10-08/385850.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现-大河网','news.dahe.cn') 98 | 2019-07-26 20:01:59,914 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258045254_364195','突发!南城一医院门口人行道塌陷 相关部门积极救援-警法频道..._搜狐','m.sohu.com') 99 | 2019-07-26 20:02:01,079 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://temp.163.com/special/ntes_404/','南城路面塌陷,救援人员争分夺秒不眠不休搜救:今日上午..._网易订阅','temp.163.com') 100 | 2019-07-26 20:02:01,720 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://ah.people.com.cn/GB/n2/2018/1008/c358314-32129820.html','四川达州地面塌陷 造成至少一人死亡--安徽频道--人民网 ','ah.people.com.cn') 101 | 2019-07-26 20:02:02,508 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.china.com.cn/2018/shizhou_1009/292010.html','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍失踪 - 市州...','sc.china.com.cn') 102 | 2019-07-26 20:02:03,328 
[DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3GED150514R9P4.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_网易订阅','dy.163.com') 103 | 2019-07-26 20:02:04,298 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.qichacha.com/postnews_19fb84f3c7d4c4036adca34dec5d6107.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川新闻-四川...','www.qichacha.com') 104 | 2019-07-26 20:02:05,929 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258145605_255783','四川达州地面塌陷 造成至少一人死亡_达川区','www.sohu.com') 105 | 2019-07-26 20:02:06,959 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258751336_364195','大搜救中的4个“关键字”_搜狐警法_搜狐网','www.sohu.com') 106 | 2019-07-26 20:02:07,982 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1614283709015688151&wfr=spider&for=pc','四川达州人行道路面塌陷区域回填结束 事件原因正在调查分析中','baijiahao.baidu.com') 107 | 2019-07-26 20:02:24,795 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net') 108 | 2019-07-26 20:02:25,040 [ERROR] baidu.py parse[line:36]: requests.get(data_dict['url']).url ===>>> HTTPConnectionPool(host='www.edushi.com', port=80): Max retries exceeded with url: /zixun/info/2-15-n4537771.html (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',)) 109 | 2019-07-26 20:03:41,157 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dazhou.gov.cn/articview_20181011102748790.html','达州市达川区人行道路面塌陷灾害搜救回顾 - 达州市人民政府 ','www.dazhou.gov.cn') 110 | 2019-07-26 20:03:42,389 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613725507012412733&wfr=spider&for=pc','达州南客站对面济民医院门口地面突然塌陷,有人跌落','baijiahao.baidu.com') 111 | 2019-07-26 20:03:42,844 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.mnw.cn/news/shehui/2068305.html','达州济民医院地面塌陷 造成至少一人死亡-闽南网','www.mnw.cn') 112 | 2019-07-26 20:03:43,378 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://blog.sina.com.cn/s/blog_5f7396520102z29j.html','达州路面塌陷事件:要追究刑事责任人_丁金坤_新浪博客','blog.sina.com.cn') 113 | 2019-07-26 20:03:44,322 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181010/20181010B1V7KD.html','南外人行道地面塌陷事故救援结束,新婚夫妇和一对父子均遇难','new.qq.com') 114 | 2019-07-26 20:03:45,417 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com') 115 | 2019-07-26 20:03:46,168 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.bjnews.com.cn/wevideo/2018/10/08/509511.html','四川达州路面塌陷救援继续 附近居民暂停水电气供应 - ..._新京报网','www.bjnews.com.cn') 116 | 2019-07-26 20:09:02,380 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> 
insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net') 117 | 2019-07-26 20:12:48,454 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net') 118 | 2019-07-26 20:14:05,291 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dazhou.gov.cn/articview_20181011102748790.html','达州市达川区人行道路面塌陷灾害搜救回顾 - 达州市人民政府 ','www.dazhou.gov.cn') 119 | 2019-07-26 20:16:58,045 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.cbda.cn/html/kx/20181010/123371.html','揪心!四川一路面突然塌陷,新婚仅4天的夫妻双亡,一对父..._中装新网','www.cbda.cn') 120 | 2019-07-26 20:16:59,684 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/AF7ACB92C3T658683.shtml','人行道路面塌陷 多人死亡令人惋惜_医生在线','news.51daifu.com') 121 | 2019-07-26 20:17:00,350 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.crjwz.com/shehui/69073.html','四川达州路面塌陷事故救援进展:两人经抢救无效死亡_今天热点新闻...','news.crjwz.com') 122 | 2019-07-26 20:17:00,833 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cctv.com/2018/10/09/ARTI32aZet0kONxAyM8i1jIZ181009.shtml','达州塌陷事故中的遇难夫妻 4天前刚举行婚礼_新闻频道_央视网(cctv...','news.cctv.com') 123 | 2019-07-26 20:17:02,134 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.xishu365.com/thread-484836-1-1.html','四川达州一人行道路面塌陷,四名大人一名小孩掉入坑内,情况有点不...','bbs.xishu365.com') 124 | 2019-07-26 20:17:05,739 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.zz-qq.com/1/97382.html','人行道路面塌陷_四川达州一人行道路面塌陷 两人遇难两人被困- ...','www.zz-qq.com') 125 | 2019-07-26 20:17:07,208 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.xianjichina.com/news/details_85746.html','达州人行道塌陷的最新进展:两人死亡,施救困难-贤集网资讯','www.xianjichina.com') 126 | 2019-07-26 20:17:09,077 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.aihami.com/a/dangjian/zugong/375555.html','达州路面塌陷事故 抢修工作现正在紧张的进行中_楚秀网','www.aihami.com') 127 | 2019-07-26 20:17:11,364 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.gazx.org/content/2018-10/8/201810815122663654.htm','达州“地陷”是天灾还是人祸?_广安在线','www.gazx.org') 128 | 2019-07-26 20:17:12,308 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.jinciwei.cn/j447103.html','最新动态!南外人行道塌陷恢复治理工作正在有序进行,赶紧看过来! 
-...','www.jinciwei.cn')
129 | 2019-07-26 20:17:23,054 [DEBUG] baidu.py parse[line:39]: 执行===>>> https://www.baidu.com/s?wd=2018%E5%B9%B410%E6%9C%887%E6%97%A5%E8%BE%BE%E5%B7%9D%E5%8C%BA%E5%8D%97%E5%A4%96%E6%B5%8E%E6%B0%91%E5%8C%BB%E9%99%A2%E9%97%A8%E5%8F%A3%E7%AA%81%E7%84%B6%E5%A1%8C%E9%99%B7%E4%BA%8B%E4%BB%B6&oq=2018%E5%B9%B410%E6%9C%887%E6%97%A5%E8%BE%BE%E5%B7%9D%E5%8C%BA%E5%8D%97%E5%A4%96%E6%B5%8E%E6%B0%91%E5%8C%BB%E9%99%A2%E9%97%A8%E5%8F%A3%E7%AA%81%E7%84%B6%E5%A1%8C%E9%99%B7%E4%BA%8B%E4%BB%B6&ie=utf-8&rsv_idx=1&rsv_pq=da4e0d0600051217&rsv_t=0bdcDWC5g2e2v0%2FFpxTTPC6IQO3RvUQxRCleqWWkBvdvuCKNo6MtAkayKAM&pn=30 花费时间22.64210271835327
130 | 
--------------------------------------------------------------------------------
/dz_spider/run.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe 
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: run.py
8 | @author: yuanlang
9 | @time: 2019-07-26 17:12
10 | ---------------------------------------
11 | """
12 | 
13 | from scrapy import cmdline
14 | # cmdline.execute(['scrapy', 'crawl', 'baidu'])
15 | cmdline.execute(['scrapy', 'crawl', 'toutiao'])
16 | # cmdline.execute(['scrapy', 'crawl', 'sogou'])
17 | 
18 | 
19 | 
--------------------------------------------------------------------------------
/dz_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = dz_spider.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dz_spider
12 | 
--------------------------------------------------------------------------------
/plan:
--------------------------------------------------------------------------------
1 | 1. Oct 7, 2018: sudden pavement collapse at the entrance of Jimin Hospital (济民医院), Nanwai, Dachuan District, Dazhou
2 | 2. Jun 1, 2018: Haoyixin (好一新) fire in Dazhou
3 | 3. Aug 2018: Dazhou taxi drivers' strike
4 | 
5 | 
6 | Data sources:
7 | Toutiao, Baidu News, Sogou, Weibo
8 | 
9 | #达州地陷#
10 | #达州好一新火灾#
11 | 
12 | 
13 | 
14 | News data: keyword extraction and topic clustering (see the sketch below)
15 | Comment data: keyword extraction and topic clustering
16 | 
17 | url title create_time update_time
--------------------------------------------------------------------------------
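
The last two items in the plan (keyword extraction and topic clustering) are presumably covered by clean/词频统计_LDA主题模型.py, which is not shown in this listing. Below is a minimal sketch of that step, assuming jieba and gensim are available; the stopwords path reuses the stopwords.txt shipped under clean/, and the two sample titles are illustrative placeholders taken from the crawl log above, not the project's real input.

# a minimal sketch, not the project's own script: segment crawled titles with jieba,
# drop stopwords, and fit a small gensim LDA model to surface topics and keywords
import jieba
from gensim import corpora, models


def load_stopwords(path="stopwords.txt"):
    # one stopword per line, UTF-8 (clean/stopwords.txt in this repo has this shape)
    try:
        with open(path, encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()


def tokenize(texts, stopwords):
    # jieba segmentation; keep multi-character tokens that are not stopwords
    return [[w for w in jieba.cut(t) if len(w) > 1 and w not in stopwords] for t in texts]


def lda_topics(docs, num_topics=5, num_words=10):
    # bag-of-words corpus -> LDA; returns the top words for each topic
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=10)
    return lda.print_topics(num_words=num_words)


if __name__ == "__main__":
    # illustrative titles only; the real input would be rows from the seed table
    titles = [
        "达州南外一人行路面发生塌陷 官方发布险情通报",
        "四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席",
    ]
    for topic in lda_topics(tokenize(titles, load_stopwords()), num_topics=2):
        print(topic)

In the actual pipeline the titles would come from the seed table that the spiders populate (see the pipelines log above) rather than a hard-coded list, and the comment data listed in the plan would go through the same tokenize/LDA path.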