├── .gitignore
├── .idea
│   ├── .name
│   ├── encodings.xml
│   ├── itjuzi_dis.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── Dockerfile
├── README.md
├── docker-compose.yml
├── itjuzi_dis
│   ├── .gitignore
│   ├── .idea
│   │   ├── .name
│   │   ├── encodings.xml
│   │   ├── itjuzi_dis.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── db_util.cpython-35.pyc
│   │   ├── items.cpython-35.pyc
│   │   ├── middlewares.cpython-35.pyc
│   │   ├── pipelines.cpython-35.pyc
│   │   └── settings.cpython-35.pyc
│   ├── db_util.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-35.pyc
│       │   └── juzi_spider.cpython-35.pyc
│       └── juzi_spider.py
├── requirements.txt
├── scrapy.cfg
└── spiders.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.5
ENV PATH /usr/local/bin:$PATH
ADD . /code
WORKDIR /code
RUN pip install -r requirements.txt
COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis
CMD /usr/local/bin/scrapy crawl itjuzi_dis
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Introduction

This project crawls company profiles from [IT Juzi][1] with scrapy so the data can be analysed to get a full picture of Chinese IT startups. An earlier single-instance spider (default of 10 concurrent requests, throttled to avoid an IP ban) needed more than a day to fetch the 30,000+ company pages, so this version switches to a distributed crawler to improve throughput.

***[Source code on GitHub][2]***

#### Tech stack: `Python3.5` `scrapy` `scrapy_redis` `redis` `docker1.12` `docker-compose` `Kitematic` `mysql` `SQLAlchemy`

## Prerequisites

1. Install `Docker` ([see here][3] to learn more and install it);
2. `pip install scrapy scrapy_redis`;

## Writing the code

1. Analyse the pages: what we need are the detail-page link of every company and the pagination links;
2. Store all collected links in one place and hand them out to multiple `spider` instances;
3. The `spider` instances share the links held in a single `redis` `list`.

### Directory structure

![directory structure][5]

### juzi_spider.py

```
# coding:utf-8

from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from itjuzi_dis.items import CompanyItem


class ITjuziSpider(RedisCrawlSpider):
    name = 'itjuzi_dis'
    allowed_domains = ['itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/company/157']
    redis_key = 'itjuziCrawler:start_urls'
    rules = [
        # links to every listing page
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        # links to every company detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        .
        . (some processing code omitted)
        .
        return item
```

**Notes:**

1. The `class` inherits from `RedisCrawlSpider` rather than `CrawlSpider`.
2. `start_urls` is replaced by a custom `redis_key`, `itjuziCrawler:start_urls`; this is the `key` under which all start links are stored in `redis`, and `scrapy_redis` pops (and removes) links from it with `redis`'s `lpop` command (a small seeding sketch follows below).
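Instead of (or in addition to) the `lpush` one-liner shown further down, the queue can be seeded from Python. This is a minimal sketch, not part of the repository, assuming the `redis` package and a redis instance reachable on `localhost:6379` (inside `docker-compose` the host name would be `redis`):

```
# seed_start_urls.py : push start URLs into the shared redis list (illustrative sketch)
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

start_urls = [
    'http://www.itjuzi.com/company',        # first listing page
    # 'http://www.itjuzi.com/company/157',  # a single detail page works too
]

for url in start_urls:
    # every idle spider pops from this list with lpop, so one lpush feeds them all
    r.lpush('itjuziCrawler:start_urls', url)

print(r.llen('itjuziCrawler:start_urls'), 'start URL(s) queued')
```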
### db_util.py

Uses `SQLAlchemy` as the `ORM` layer; if the tables do not exist yet, they are created automatically.

### middlewares.py

Adds a large pool of `User-Agent` strings and picks one at random for every request, so the site cannot block the crawler by `User-Agent`.

### settings.py

Wires up `middlewares.py`, `scrapy_redis`, and the `redis` connection details.

## Deployment

The `Dockerfile` and `docker-compose.yml` already appear in the directory structure above.

### Dockerfile

```
FROM python:3.5
ENV PATH /usr/local/bin:$PATH
ADD . /code
WORKDIR /code
RUN pip install -r requirements.txt
COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis
CMD /usr/local/bin/scrapy crawl itjuzi_dis
```

**Notes:**

- Uses `python:3.5` as the base image.
- Puts `/usr/local/bin` on the `PATH` environment variable.
- Adds the project code into `/code` in the image and makes it the `WORKDIR`.
- Installs the dependencies from `requirements.txt`.
- The line `COPY spiders.py /usr/local/lib/python3.5/site-packages/scrapy_redis` deserves special mention: it copies the patched `spiders.py` from the `host` into the `scrapy_redis` install directory inside the `container`. The value that `lpop` fetches from `redis` is a `str` under `python2` but `bytes` under `python3`, so this has to be fixed inside `scrapy_redis`: line 84 of `spiders.py` needs the change (see the sketch right after this list).
- Runs the crawl command `scrapy crawl itjuzi_dis` as soon as the container starts.
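The `bytes` vs `str` point is easy to reproduce outside the crawler. A short sketch, assuming the `redis` package and a local redis instance (not part of the repository), shows why the patched line in `spiders.py` has to call `.decode('utf-8')` under Python 3:

```
import redis

r = redis.StrictRedis(host='localhost', port=6379)  # decode_responses defaults to False
r.lpush('itjuziCrawler:start_urls', 'http://www.itjuzi.com/company')

data = r.lpop('itjuziCrawler:start_urls')
print(type(data))            # Python 3: <class 'bytes'>, Python 2: str
url = data.decode('utf-8')   # the patched line in spiders.py does exactly this
print('://' in url)          # True, so make_request_from_data() can build the request
```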
### docker-compose.yml

```
version: '2'
services:
  spider:
    build: .
    volumes:
      - .:/code
    links:
      - redis
    depends_on:
      - redis
  redis:
    image: redis
    ports:
      - "6379:6379"
```

**Notes:**

- Uses version 2 of the `compose` file format.
- Defines two services, `spider` and `redis`.
- `spider` is built from the `Dockerfile` in the current directory by default; `redis` is created from the `redis:latest` image and publishes port 6379.

### Start the deployment

**Start the containers**

    docker-compose up              # create the containers described in docker-compose.yml
    docker-compose scale spider=4  # scale the spider service out to 4 instances, all sharing the same redis

You can watch the containers being created and running in the `Kitematic` GUI:

![Kitematic][6]

Until `start_urls` is set, the spiders in all 4 `container`s just sit there, starved and waiting:

![idle spiders][7]

Now push the `start_urls` into `redis`:

    lpush itjuziCrawler:start_urls http://www.itjuzi.com/company

All 4 spiders spring into action and keep crawling until `start_urls` is drained:

![running spiders][8]

That's all!


[1]: http://www.itjuzi.com/company
[2]: https://github.com/caoxiaozh/itjuzi_dis
[3]: https://www.docker.com/products/overview
[4]: https://github.com/caoxiaozh/itjuzi_dis
[5]: https://segmentfault.com/img/bVAlLY
[6]: https://segmentfault.com/img/bVAlSh
[7]: https://segmentfault.com/img/bVAlS7
[8]: https://segmentfault.com/img/bVAlUh

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'
services:
  spider:
    build: .
    volumes:
      - .:/code
    links:
      - redis
    depends_on:
      - redis
  redis:
    image: redis
    ports:
      - "6379:6379"

--------------------------------------------------------------------------------
/itjuzi_dis/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/itjuzi_dis/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/itjuzi_dis/db_util.py:
--------------------------------------------------------------------------------
# coding:utf-8

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
url = 'mysql+pymysql://root:root@192.168.0.7/spider_tools?charset=utf8'
engine = create_engine(url, echo=False)


class DB_Util(object):
    @staticmethod
    def get_session(url=None):
        Session = sessionmaker(bind=engine)
        session = Session()
        return session

    @staticmethod
    def init_db():
        Base.metadata.create_all(engine)


class JuziCompany(Base):
    __tablename__ = 't_juzi_company'
    id = Column(Integer, primary_key=True)
    company_name = Column(String(100), nullable=True)
    slogan = Column(String(100), nullable=True)
    scope = Column(String(30), nullable=True)
    sub_scope = Column(String(30), nullable=True)
    city = Column(String(30), nullable=True)
    area = Column(String(30), nullable=True)
    home_page = Column(String(100), nullable=True)
    tags = Column(String(200))
    company_intro = Column(String(500), nullable=True)
    company_full_name = Column(String(100), nullable=True)
    found_time = Column(String(10), nullable=True)
    company_size = Column(String(20), nullable=True)
    company_status = Column(String(20), nullable=True)
    info_id = Column(String(20), nullable=False)


class JuziTeam(Base):
    __tablename__ = 't_juzi_team'
    id = Column(Integer, primary_key=True)
    company_id = Column(String(20), nullable=False)
    tm_m_name = Column(String(100), nullable=True)
    tm_m_title = Column(String(100), nullable=True)
    tm_m_intro = Column(String(500), nullable=True)


class JuziTz(Base):
    __tablename__ = 't_juzi_tz'
    company_id = Column(String(20), nullable=False)
    id = Column(Integer, primary_key=True)
    tz_time = Column(String(100), nullable=True)
    tz_round = Column(String(20), nullable=True)
    tz_finades = Column(String(100), nullable=True)
    tz_capital = Column(String(500), nullable=True)


class JuziProduct(Base):
    __tablename__ = 't_juzi_product'
    company_id = Column(String(20), nullable=False)
    id = Column(Integer, primary_key=True)
    pdt_name = Column(String(100), nullable=True)
    pdt_type = Column(String(100), nullable=True)
    pdt_intro = Column(String(500), nullable=True)
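Not something that ships with the project, but the ORM layer above can be exercised on its own as a quick sanity check. This sketch assumes the MySQL instance from `url` is reachable and that some items have already been stored:

```
# query_companies.py : ad-hoc check of the models defined in db_util.py (illustrative sketch)
from itjuzi_dis.db_util import DB_Util, JuziCompany

DB_Util.init_db()                  # creates the t_juzi_* tables if they are missing
session = DB_Util.get_session()

print('companies stored:', session.query(JuziCompany).count())

for company in session.query(JuziCompany).limit(5):
    print(company.info_id, company.company_name, company.city)

session.close()
```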
--------------------------------------------------------------------------------
/itjuzi_dis/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CompanyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info_id = scrapy.Field()
    company_name = scrapy.Field()
    slogan = scrapy.Field()
    scope = scrapy.Field()
    sub_scope = scrapy.Field()
    city = scrapy.Field()
    area = scrapy.Field()
    home_page = scrapy.Field()
    tags = scrapy.Field()
    company_intro = scrapy.Field()
    company_full_name = scrapy.Field()
    found_time = scrapy.Field()
    company_size = scrapy.Field()
    company_status = scrapy.Field()
    tz_info = scrapy.Field()
    tm_info = scrapy.Field()
    pdt_info = scrapy.Field()

--------------------------------------------------------------------------------
/itjuzi_dis/middlewares.py:
--------------------------------------------------------------------------------
# coding:utf8

import logging
import random

# Start your middleware class

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random user-agent for this request
        ua = random.choice(self.user_agent_list)
        if ua:
            logging.info(ua)
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list composes chrome, IE, firefox, Mozilla, opera, netscape
    # for more user agent strings, you can find them at http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]


# class ProxyMiddleware(object):
#     # overwrite process request
#     def process_request(self, request, spider):
#         # Set the location of the proxy
#         sql = 'select ip,port from t_proxy_ip t where t.is_valid =1'
#         result = SqlUtil.query_all(sql)
#         ip_port = random.choice(result)
#         logging.info(ip_port)
#         request.meta['proxy'] = "http://{0}:{1}".format(ip_port['ip'], ip_port['port'])
#         # # Use the following lines if your proxy requires authentication
#         # proxy_user_pass = "USERNAME:PASSWORD"
#         # # setup basic authentication for the proxy
#         # encoded_user_pass = base64.encodestring(proxy_user_pass)
#         # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
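A quick standalone way to watch the user-agent rotation (an illustrative snippet, not part of the project): build a bare `Request`, pass it through `process_request`, and look at the header that gets set.

```
from scrapy.http import Request
from itjuzi_dis.middlewares import RotateUserAgentMiddleware

mw = RotateUserAgentMiddleware()
request = Request('http://www.itjuzi.com/company')

# the middleware only mutates the request, so no real spider instance is needed here
mw.process_request(request, spider=None)
print(request.headers.get('User-Agent'))
```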
Firefox/3.0.3", 56 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5", 57 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14", 58 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15", 59 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 60 | ] 61 | 62 | 63 | # class ProxyMiddleware(object): 64 | # # overwrite process request 65 | # def process_request(self, request, spider): 66 | # # Set the location of the proxy 67 | # sql = 'select ip,port from t_proxy_ip t where t.is_valid =1' 68 | # result = SqlUtil.query_all(sql) 69 | # ip_port = random.choice(result) 70 | # logging.info(ip_port) 71 | # request.meta['proxy'] = "http://{0}:{1}".format(ip_port['ip'], ip_port['port']) 72 | # # # Use the following lines if your proxy requires authentication 73 | # # proxy_user_pass = "USERNAME:PASSWORD" 74 | # # # setup basic authentication for the proxy 75 | # # encoded_user_pass = base64.encodestring(proxy_user_pass) 76 | # # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 77 | -------------------------------------------------------------------------------- /itjuzi_dis/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from scrapy.exceptions import DropItem 8 | 9 | from itjuzi_dis.db_util import JuziCompany,DB_Util,JuziTeam,JuziTz,JuziProduct 10 | 11 | 12 | # 去重复的 company 13 | class DuplicatesPipeline(object): 14 | 15 | def __init__(self): 16 | self.ids_seen = set() 17 | 18 | def process_item(self, item, spider): 19 | if item['info_id'] in self.ids_seen: 20 | raise DropItem("Duplicate item found: %s" % item) 21 | else: 22 | self.ids_seen.add(item['info_id']) 23 | return item 24 | 25 | 26 | class ItjuziSpiderPipeline(object): 27 | def open_spider(self, spider): 28 | DB_Util.init_db() # 表不存在时候,初始化表结构 29 | 30 | def process_item(self, item, spider): 31 | if not item['info_id']: 32 | raise DropItem('item info_id is null.{0}'.format(item)) 33 | else: 34 | session = DB_Util.get_session() 35 | company = JuziCompany() 36 | company.company_name = item['company_name'] 37 | company.slogan = item['slogan'] 38 | company.scope=item['scope'] 39 | company.sub_scope=item['sub_scope'] 40 | company.city = item['city'] 41 | company.area = item['area'] 42 | company.home_page=item['home_page'] 43 | company.tags=item['tags'] 44 | company.company_intro=item['company_intro'] 45 | company.company_full_name=item['company_full_name'] 46 | company.found_time=item['found_time'] 47 | company.company_size=item['company_size'] 48 | company.company_status=item['company_status'] 49 | company.info_id = item['info_id'] 50 | session.add(company) 51 | if item['tz_info']: 52 | for touzi in item['tz_info']: 53 | tz = JuziTz() 54 | tz.company_id = company.info_id 55 | tz.tz_time = touzi['tz_time'] 56 | tz.tz_finades = touzi['tz_finades'] 57 | tz.tz_capital = touzi['tz_capital'] 58 | tz.tz_round = touzi['tz_round'] 59 | session.add(tz) 60 | if item['tm_info']: 61 | for team in item['tm_info']: 62 | tm = JuziTeam() 63 | tm.company_id = company.info_id 64 | tm.tm_m_name = team['tm_m_name'] 65 | tm.tm_m_title = team['tm_m_title'] 66 | tm.tm_m_intro = team['tm_m_intro'] 67 
| session.add(tm) 68 | if item['pdt_info']: 69 | for product in item['pdt_info']: 70 | pdt = JuziProduct() 71 | pdt.company_id = company.info_id 72 | pdt.pdt_name = product['pdt_name'] 73 | pdt.pdt_type = product['pdt_type'] 74 | pdt.pdt_intro = product['pdt_intro'] 75 | session.add(pdt) 76 | session.commit() 77 | return item 78 | -------------------------------------------------------------------------------- /itjuzi_dis/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for itjuzi_dis project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'itjuzi_dis' 13 | 14 | SPIDER_MODULES = ['itjuzi_dis.spiders'] 15 | NEWSPIDER_MODULE = 'itjuzi_dis.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'itjuzi_dis (+http://www.yourdomain.com)' 20 | 21 | # USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2810.2 Safari/537.36' 22 | 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | DOWNLOAD_DELAY = 1.5 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'itjuzi_dis.middlewares.MyCustomSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80, 60 | # 'itjuzi_dis.middlewares.ProxyMiddleware': 90, 61 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100, 62 | 'itjuzi_dis.middlewares.RotateUserAgentMiddleware': 200, 63 | } 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'itjuzi_dis.pipelines.DuplicatesPipeline': 200, 75 | 'itjuzi_dis.pipelines.ItjuziSpiderPipeline': 300, 76 | 'scrapy_redis.pipelines.RedisPipeline': 300 77 | } 78 | 79 | # Enable and configure the 
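The de-duplication step can also be seen in isolation (again an illustrative sketch, not shipped with the repo): feed the pipeline two items carrying the same `info_id` and the second one is dropped.

```
from scrapy.exceptions import DropItem
from itjuzi_dis.items import CompanyItem
from itjuzi_dis.pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()
item = CompanyItem(info_id='157', company_name='demo')

pipeline.process_item(item, spider=None)      # first time: passes through
try:
    pipeline.process_item(item, spider=None)  # second time: raises DropItem
except DropItem as exc:
    print('dropped:', exc)
```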
--------------------------------------------------------------------------------
/itjuzi_dis/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for itjuzi_dis project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'itjuzi_dis'

SPIDER_MODULES = ['itjuzi_dis.spiders']
NEWSPIDER_MODULE = 'itjuzi_dis.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'itjuzi_dis (+http://www.yourdomain.com)'

# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2810.2 Safari/537.36'


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'itjuzi_dis.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80,
    # 'itjuzi_dis.middlewares.ProxyMiddleware': 90,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'itjuzi_dis.middlewares.RotateUserAgentMiddleware': 200,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'itjuzi_dis.pipelines.DuplicatesPipeline': 200,
    'itjuzi_dis.pipelines.ItjuziSpiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# REDIRECT_ENABLED = False
RANDOMIZE_DOWNLOAD_DELAY = True

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# REDIS_START_URLS_AS_SET = True

REDIS_PARAMS = {'host': 'redis', 'decode_responses': False}
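With the `scrapy_redis` scheduler and dupefilter enabled above, all shared crawl state lives in redis, which makes it easy to peek at progress from outside the spiders. A small inspection sketch; the `itjuzi_dis:*` key names are the `scrapy_redis` defaults derived from the spider name, so treat them as assumptions if you override them:

```
import redis

# "redis" matches the docker-compose service name used in REDIS_PARAMS above;
# use "localhost" when running outside the containers.
r = redis.StrictRedis(host='redis', port=6379)

print('dupefilter fingerprints:', r.scard('itjuzi_dis:dupefilter'))    # a set of request fingerprints
print('scheduler queue size   :', r.zcard('itjuzi_dis:requests'))      # a sorted set with the default queue class
print('start urls still queued:', r.llen('itjuziCrawler:start_urls'))  # the list fed by lpush
```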
--------------------------------------------------------------------------------
/itjuzi_dis/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/itjuzi_dis/spiders/juzi_spider.py:
--------------------------------------------------------------------------------
# coding:utf-8

from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from itjuzi_dis.items import CompanyItem


class ITjuziSpider(RedisCrawlSpider):
    name = 'itjuzi_dis'
    allowed_domains = ['itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/company/157']
    redis_key = 'itjuziCrawler:start_urls'
    rules = [
        # links to every listing page
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        # links to every company detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')
            slogan = cpy1.find(class_='info-line').p.get_text()
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else ''
            sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else ''
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            city = city_a[0].get_text().strip() if len(city_a) > 0 else ''
            area = city_a[1].get_text().strip() if len(city_a) > 1 else ''

            home_page = cpy1.find(class_='weblink marl10')['href']
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().replace('\n', ',')

        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:
            company_intro = cpy2.find(class_='des').get_text().strip()
            cpy2_content = cpy2.find(class_='des-more').contents
            company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else ''
            found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[3] else ''
            company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[3] else ''
            company_status = cpy2_content[5].get_text().strip() if cpy2_content[5] else ''

        main = soup.find('div', class_='main')

        # funding rounds
        tz = main.find('table', 'list-round-v2')
        tz_list = []
        if tz:
            all_tr = tz.find_all('tr')
            for tr in all_tr:
                tz_dict = {}
                all_td = tr.find_all('td')
                tz_dict['tz_time'] = all_td[0].span.get_text().strip()
                tz_dict['tz_round'] = all_td[1].get_text().strip()
                tz_dict['tz_finades'] = all_td[2].get_text().strip()
                tz_dict['tz_capital'] = all_td[3].get_text().strip().replace('\n', ',')
                tz_list.append(tz_dict)

        # team members
        tm = main.find('ul', class_='list-prodcase limited-itemnum')
        tm_list = []
        if tm:
            for li in tm.find_all('li'):
                tm_dict = {}
                tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip()
                tm_dict['tm_m_title'] = li.find('span', class_='c-gray').get_text().strip()
                tm_dict['tm_m_intro'] = li.find('p', class_='mart10 person-des').get_text().strip()
                tm_list.append(tm_dict)

        # products
        pdt = main.find('ul', class_='list-prod limited-itemnum')
        pdt_list = []
        if pdt:
            for li in pdt.find_all('li'):
                pdt_dict = {}
                pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip()
                pdt_dict['pdt_type'] = li.find('span', class_='tag yellow').get_text().strip()
                pdt_dict['pdt_intro'] = li.find(class_='on-edit-hide').p.get_text().strip()
                pdt_list.append(pdt_dict)

        item = CompanyItem()
        item['info_id'] = response.url.split('/')[-1:][0]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list
        return item

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
BeautifulSoup4
lxml
scrapy
scrapy_redis
redis
sqlalchemy
pymysql

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = itjuzi_dis.settings

[deploy]
#url = http://localhost:6800/
project = itjuzi_dis
--------------------------------------------------------------------------------
/spiders.py:
--------------------------------------------------------------------------------
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection


# Default batch size matches default concurrent requests setting.
DEFAULT_START_URLS_BATCH_SIZE = 16
DEFAULT_START_URLS_KEY = '%(name)s:start_urls'


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    # Per spider redis key, default to DEFAULT_KEY.
    redis_key = None
    # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE.
    redis_batch_size = None
    # Redis client instance.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET')
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            # Under Python 3 the redis client returns bytes, so decode before
            # building the request (this is the patched line mentioned in the README).
            req = self.make_request_from_data(data.decode('utf-8'))
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        # By default, data is an URL.
        if '://' in data:
            return self.make_requests_from_url(data)
        else:
            self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj