├── .coveragerc ├── .travis.yml ├── LICENSE ├── README-SETUP.rst ├── README.md ├── pic ├── feijishi.png ├── jiagou.png ├── jichu.png ├── jigou.png ├── jishi-huoqushuju.png ├── jishi-shouquan.png └── spider.jpg ├── requirements.txt ├── sasila-example ├── car_processor.py ├── fang_processor.py ├── main.py ├── proxy.txt └── settings.py ├── sasila ├── __init__.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── system_instant │ ├── __init__.py │ ├── blueprints │ │ ├── __init__.py │ │ └── jd.py │ ├── crawler │ │ ├── __init__.py │ │ └── jd │ │ │ ├── __init__.py │ │ │ └── request.py │ ├── database │ │ ├── __init__.py │ │ └── jd_database.py │ ├── manager │ │ ├── __init__.py │ │ └── jd_manager.py │ └── settings.py ├── system_normal │ ├── __init__.py │ ├── blueprints │ │ ├── __init__.py │ │ └── slow_spiders.py │ ├── database │ │ └── __init__.py │ ├── downloader │ │ ├── __init__.py │ │ ├── base_downloder.py │ │ ├── http │ │ │ ├── __init__.py │ │ │ ├── selenium_response.py │ │ │ ├── spider_request.py │ │ │ └── spider_response.py │ │ ├── proxy │ │ │ ├── __init__.py │ │ │ └── proxy_pool.py │ │ ├── requests_downloader.py │ │ ├── selenium_downloader.py │ │ └── web_driver_pool.py │ ├── loginer │ │ ├── __init__.py │ │ ├── base_loginer.py │ │ └── jd_loginer.py │ ├── manager │ │ ├── __init__.py │ │ └── spider_manager.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── base_pipeline.py │ │ ├── console_pipeline.py │ │ ├── kafa_pipeline.py │ │ ├── pic_pipeline.py │ │ ├── pipe_item.py │ │ ├── test_pipeline.py │ │ └── text_pipeline.py │ ├── processor │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── bendibao_processor.py │ │ ├── car_processor.py │ │ ├── city.txt │ │ ├── city_location_processor.py │ │ ├── fang_processor.py │ │ ├── fang_shop_processor.py │ │ ├── fe_loan_processor.py │ │ ├── first_processor.py │ │ ├── mzitu_proccessor.py │ │ ├── mzitu_proccessor_regex.py │ │ ├── qcc_processor.py │ │ └── test_processor.py │ ├── scheduler │ │ ├── __init__.py │ │ ├── bloom_filter.py │ │ └── queue.py │ ├── spider │ │ ├── __init__.py │ │ └── spider_core.py │ └── utils │ │ ├── __init__.py │ │ ├── cookie.py │ │ ├── decorator.py │ │ ├── httpobj.py │ │ ├── jd_code.py │ │ ├── kafka_utils.py │ │ ├── progress_bar.py │ │ ├── python.py │ │ └── reqser.py └── system_web │ └── __init__.py ├── setup.py └── tests ├── __init__.py └── test_processor.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | sasila 4 | parallel = True 5 | 6 | [report] 7 | omit = 8 | sasila/system_instant/* 9 | sasila/system_normal/processor/* 10 | 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | if self.debug: 15 | if settings.DEBUG 16 | raise AssertionError 17 | raise NotImplementedError 18 | if 0: 19 | if __name__ == .__main__.: 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | cache: pip 4 | python: 5 | - "2.7" 6 | - "3.5" 7 | services: 8 | - redis-server 9 | install: 10 | - pip install --no-use-wheel lxml 11 | - pip install --allow-all-external -e .[test] 12 | - pip install coveralls 13 | script: 14 | - coverage run setup.py test 15 | after_success: 16 | - coverage combine 17 | - coveralls 18 | 19 | deploy: 20 | provider: pypi 21 | skip_cleanup: true 22 | distributions: "sdist bdist_wheel" 23 | user: 'DaVinciDW' 24 | password: 25 | secure: 
"Uh+Q37dRElSmZ1YxiGv9aeg59xyCJ6dSJ87L/P1dFowHx267dX4l9xU9v3skrQSBKEIe8JjCHHT3b9D//1daObWuJ1PHQ6IdD5pp7Lwl2CkBW1TOP9MjAcZV9F0udH3X986owP8KCuwoVJglLWch+3FtI7iNpdrlcIUXlgwS4eAAfF6DmUJG5BSiHDfCdvEyLE2D13MqyXqWNixU9FQ6/5IPfEVrJsW0W0s+fUnvPNSq/R4l9oHrkhUb+2oI7OwYcCG+wXz6KOZaSn69a/sOPRI3thfc9v2FWKsz+XvBhqvNA67q2Q1kHaIn+KZnct+ZJD2tK4NrEJznf4mBliLT31YVsvYHmnsfO34+3W5G+PVdywE2j63uKAFVRzWfYRBVD1UAr0yFuCPD3Ghh7GzHFXEZm5Tltbng2BZQT82BxY4B8IPgHUMf418wRiOBKDGPSoZiHBXVtjwbWez36HOaMenXurLMaoCDWsUzl4QIJF723L5fS/z5Xq8iOoMo+5bsEIfp6BpsYh33n1zL887p03IFJHRnFlCPjdZJ7cQnBV2HTPwUNrls6c8DzaMncUj5W203k48nHm6YhspeS+uIEIrz2eCOgYD5AjjeBRsZfXlG6+DC0+O7Srnuih61xR0vJXQ9PpYCoPI5BMgQo+xwJJz2BP5IX7IpZ2HWJHFKC0E=" 26 | on: 27 | branch: master 28 | tags: false 29 | repo: DarkSand/Sasila 30 | condition: $TRAVIS_PYTHON_VERSION = "2.7" 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Binux 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README-SETUP.rst: -------------------------------------------------------------------------------- 1 | Sasila 2 | ====== 3 | .. image:: https://img.shields.io/badge/version-0.0.1-green.svg 4 | :target: https://pypi.python.org/pypi/Sasila 5 | :alt: Sasila Version 6 | 7 | .. 
image:: https://img.shields.io/badge/pypi-v1.4.0-green.svg 8 | :target: https://pypi.python.org/pypi/Sasila 9 | :alt: Wheel Status 10 | 11 | Overview 12 | ======== 13 | sasila is a simple spider system 14 | 15 | Install 16 | ======= 17 | 18 | The quick way:: 19 | 20 | pip install sasila 21 | 22 | Tutorial 23 | ======= 24 | car_processor.py:: 25 | 26 | #!/usr/bin/env python 27 | # -*- coding: utf-8 -*- 28 | from sasila.system_normal.spider.spider_core import SpiderCore 29 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 30 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar 31 | from sasila.system_normal.processor.base_processor import BaseProcessor 32 | from sasila.system_normal.downloader.http.spider_request import Request 33 | from sasila.system_normal.utils.decorator import checkResponse 34 | 35 | from bs4 import BeautifulSoup as bs 36 | import json 37 | import time 38 | import sys 39 | 40 | reload(sys) 41 | sys.setdefaultencoding("utf-8") 42 | 43 | 44 | class Car_Processor(BaseProcessor): 45 | spider_id = "car_spider" 46 | spider_name = "car_spider" 47 | allowed_domains = ["che168.com"] 48 | start_requests = [Request(url="http://www.che168.com", priority=0)] 49 | 50 | @checkResponse 51 | def process(self, response): 52 | soup = bs(response.m_response.content, "lxml") 53 | province_div_list = soup.select("div.city-list div.cap-city > div.fn-clear") 54 | for province_div in province_div_list: 55 | province_name = province_div.select("span.capital a")[0].text 56 | city_list = province_div.select("div.city a") 57 | for city in city_list: 58 | city_name = city.text 59 | pinyin = city["href"].strip("/").split("/")[0] 60 | request = Request( 61 | url="http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s" % pinyin, 62 | priority=1, callback=self.process_page_1) 63 | request.meta["province"] = province_name 64 | request.meta["city"] = city_name 65 | yield request 66 | 67 | @checkResponse 68 | def process_page_1(self, response): 69 | brand_list = list(json.loads(response.m_response.content.decode("gb2312"))) 70 | for brand in brand_list: 71 | brand_dict = dict(brand) 72 | brand_name = brand_dict["name"] 73 | url = response.nice_join(brand_dict["url"]) + "/" 74 | request = Request(url=url, priority=2, callback=self.process_page_2) 75 | request.meta["province"] = response.request.meta["province"] 76 | request.meta["city"] = response.request.meta["city"] 77 | request.meta["brand"] = brand_name 78 | yield request 79 | 80 | @checkResponse 81 | def process_page_2(self, response): 82 | soup = bs(response.m_response.content, "lxml") 83 | cars_line_list = soup.select("div#series div.content-area dl.model-list dd a") 84 | for cars_line in cars_line_list: 85 | cars_line_name = cars_line.text 86 | url = "http://www.che168.com" + cars_line["href"] 87 | request = Request(url=url, priority=3, callback=self.process_page_3) 88 | request.meta["province"] = response.request.meta["province"] 89 | request.meta["city"] = response.request.meta["city"] 90 | request.meta["brand"] = response.request.meta["brand"] 91 | request.meta["cars_line"] = cars_line_name 92 | yield request 93 | 94 | @checkResponse 95 | def process_page_3(self, response): 96 | soup = bs(response.m_response.content, "lxml") 97 | car_info_list = soup.select("div#a2 ul#viewlist_ul li a.carinfo") 98 | for car_info in car_info_list: 99 | url = "http://www.che168.com" + car_info["href"] 100 | request = Request(url=url, priority=4, callback=self.process_page_4) 101 | 
request.meta["province"] = response.request.meta["province"] 102 | request.meta["city"] = response.request.meta["city"] 103 | request.meta["brand"] = response.request.meta["brand"] 104 | request.meta["cars_line"] = response.request.meta["cars_line"] 105 | yield request 106 | next_page = soup.find(lambda tag: tag.name == "a" and "下一页" in tag.text) 107 | if next_page: 108 | url = "http://www.che168.com" + next_page["href"] 109 | request = Request(url=url, priority=3, callback=self.process_page_3) 110 | request.meta["province"] = response.request.meta["province"] 111 | request.meta["city"] = response.request.meta["city"] 112 | request.meta["brand"] = response.request.meta["brand"] 113 | request.meta["cars_line"] = response.request.meta["cars_line"] 114 | yield request 115 | 116 | @checkResponse 117 | def process_page_4(self, response): 118 | soup = bs(response.m_response.content, "lxml") 119 | # Object moved 120 | #
Object moved to here.
121 | # 122 | if len(soup.select("div.car-title h2")) != 0: 123 | car = soup.select("div.car-title h2")[0].text 124 | detail_list = soup.select("div.details li") 125 | if len(detail_list) == 0: 126 | soup = bs(response.m_response.content, "html5lib") 127 | detail_list = soup.select("div.details li") 128 | mileage = detail_list[0].select("span")[0].text.replace("万公里", "") 129 | first_borad_date = detail_list[1].select("span")[0].text 130 | gear = detail_list[2].select("span")[0].text.split("/")[0] 131 | displacement = detail_list[2].select("span")[0].text.split("/")[1] 132 | price = soup.select("div.car-price ins")[0].text.replace("¥", "") 133 | crawl_date = time.strftime("%Y-%m-%d", time.localtime(time.time())) 134 | 135 | item = dict() 136 | item["car"] = car 137 | item["mileage"] = mileage 138 | item["first_borad_date"] = first_borad_date 139 | item["gear"] = gear 140 | item["displacement"] = displacement 141 | item["price"] = price 142 | item["crawl_date"] = crawl_date 143 | 144 | item["province"] = response.request.meta["province"] 145 | item["city"] = response.request.meta["city"] 146 | item["brand"] = response.request.meta["brand"] 147 | item["cars_line"] = response.request.meta["cars_line"] 148 | yield item 149 | 150 | main.py:: 151 | 152 | #!/usr/bin/env python 153 | # -*- coding: utf-8 -*- 154 | from car_processor import Car_Processor 155 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 156 | from sasila.system_normal.spider.spider_core import SpiderCore 157 | from sasila.system_normal.manager import manager 158 | import sasila 159 | 160 | spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()) 161 | manager.set_spider(spider_car) 162 | sasila.start() 163 | 164 | then start your redis and run script:: 165 | 166 | python main.py 167 | 168 | then start your spider in your browser:: 169 | 170 | http://127.0.0.1:5000/slow_spider/start?spider_id=car_spider 171 | 172 | you can stop spider:: 173 | 174 | http://127.0.0.1:5000/slow_spider/start?spider_id=car_spider 175 | 176 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sasila [![PyPI Version]][PyPI] [![Build Status]][Travis CI] [![Coverage Status]][Coverage] 2 | 3 |   现在有很多爬虫框架,比如[**scrapy**](https://github.com/scrapy/scrapy)、[**webmagic**](https://github.com/code4craft/webmagic)、[**pyspider**](https://github.com/binux/pyspider)都可以在爬虫工作中使用,也可以直接通过[**requests**](https://github.com/requests/requests)+[**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4)来写一些个性化的小型爬虫脚本。但是在实际爬取过程当中,爬虫框架各自有优势和缺陷。比如scrapy,它的功能强大,但过于强大的功能也许反而让新手无所适从,并且它采用twisted异步框架开发,对新手来说源码难以理解,项目难于调试。所以我模仿这些爬虫框架的优势,以尽量简单的原则,搭配gevent(实际上是grequests)开发了这套轻量级爬虫框架。 4 | 5 | ![jiagou](https://github.com/DarkSand/Sasila/blob/master/pic/jigou.png) 6 | 7 | * downloader是下载器。 8 | * processor是解析器。 9 | * scheduler是调度器。 10 | * pipeline是数据处理器。 11 | * 将下载器,解析器,调度器,数据处理器注入核心core成为spider对象。 12 | * 通过manager管理spider对象。 13 | * manager透过webapi提供外部访问/控制接口。 14 | 15 | ## **主要特点** 16 | 17 | * 框架代码结构简单易用,易于修改。新手、老鸟皆可把控。 18 | * 采用gevent实现并发操作,与scrapy的twisted相比,代码更容易理解。 19 | * 完全模块化的设计,强大的可扩展性。 20 | * 使用方式和结构参考了[**scrapy**](https://github.com/scrapy/scrapy)和[**webmagic**](https://github.com/code4craft/webmagic)。对有接触过这两个框架的朋友非常友好。 21 | * 不采用命令行来启动爬虫,方便调试。 22 | * 
对数据的解析模块并没有集成,可以自由使用[**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4)、[**lxml**](https://github.com/lxml/lxml)、[**pyquery**](https://github.com/gawel/pyquery)、[**html5lib**](https://github.com/html5lib/html5lib-python)等等各种解析器进行数据抽取。 23 | * 集成代理换IP功能。 24 | * 支持高并发抓取数据。 25 | * 支持分布式。 26 | * 支持增量爬取。 27 | * 支持爬取js动态渲染的页面(加载SeleniumDownLoader即可)。 28 | * 提供webapi对爬虫进行管理、监控。 29 | * 提供即时爬虫的集成思路和结构。 30 | 31 | ## **安装** 32 | ``` 33 | pip install sasila 34 | ``` 35 | ## **准备** 36 | * 请准备好您的redis服务器进行调度。 37 | * 并在settings.py文件中 写入您的redis服务器地址 38 | ```python 39 | REDIS_HOST = 'localhost' 40 | REDIS_PORT = 6379 41 | ``` 42 | ## **构建processor(解析器)** 43 | ```python 44 | #!/usr/bin/env python 45 | # -*- coding: utf-8 -*- 46 | from bs4 import BeautifulSoup as bs 47 | from sasila.system_normal.processor.base_processor import BaseProcessor 48 | from sasila.system_normal.downloader.http.spider_request import Request 49 | from sasila.system_normal.spider.spider_core import SpiderCore 50 | 51 | class Mzi_Processor(BaseProcessor): 52 | spider_id = 'mzi_spider' 53 | spider_name = 'mzi_spider' 54 | allowed_domains = ['mzitu.com'] 55 | start_requests = [Request(url='http://www.mzitu.com/', priority=0)] 56 | 57 | @checkResponse 58 | def process(self, response): 59 | soup = bs(response.m_response.content, 'lxml') 60 | print soup.title.string 61 | href_list = soup.select('a') 62 | for href in href_list: 63 | yield Request(url=response.nice_join(href['href'])) 64 | ``` 65 | **写法与scrapy几乎一样** 66 | 67 | * 所有的解析器都继承自 *BaseProcessor* ,默认入口解析函数为def process(self, response)。 68 | * 为该解析器设置spider_id和spider_name,以及限定域名。 69 | * 初始爬取请求为 *start_requests*,构建Request对象,该对象支持GET、POST方法,支持优先级,设置回调函数等等所有构建request对象的一切属性。默认回调函数为 *process*。 70 | * 可以使用@checkResponse装饰器对返回的 *response* 进行校验并记录异常日志。你也可以定义自己的装饰器。 71 | * 解析函数因为使用 *yield* 关键字,所以是一个生成器。当 *yield* 返回 *Request* 对象,则会将 *Request* 对象推入调度器等待调度继续进行爬取。若 *yield* 不是返回 *Request* 对象则会进入 *pipeline* , *pipeline* 将对数据进行清洗入库等操作。 72 | 73 | **与scrapy相似,sasila同样提供*LinkExtractor的*方式来提取链接,以下是用*LinkExtractor*的方式构造*processor*下载妹子图的示例** 74 | 75 | ```python 76 | #!/usr/bin/env python 77 | # -*- coding: utf-8 -*- 78 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor 79 | from sasila.system_normal.downloader.http.spider_request import Request 80 | import os 81 | import uuid 82 | 83 | class MezituProcessor(BaseProcessor): 84 | spider_id = 'mzitu' 85 | spider_name = 'mzitu' 86 | allowed_domains = ['mzitu.com', 'meizitu.net'] 87 | start_requests = [Request(url='http://www.mzitu.com/xinggan/')] 88 | 89 | rules = ( 90 | Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),callback="save", priority=3), 91 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1), 92 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2), 93 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0), 94 | ) 95 | 96 | def save(self, response): 97 | if response.m_response: 98 | if not os.path.exists("img"): 99 | os.mkdir("img") 100 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs: 101 | fs.write(response.m_response.content) 102 | print("download success!") 103 | ``` 104 | 105 | **LinkExtractor的构造方式为** 106 | 107 | ```python 108 | LinkExtractor(regex_str=None, css_str=None, process_value=None) 109 | ``` 110 | 111 | * 提供正则表达式提取方式:*regex_str* 112 | * 提供css选择器提取方式:*css_str* 113 | * 也可以自定义*process_value*来提取链接,其中*process_value*是一个生成器 114 | * 若使用此方式构造*processor*,请不要定义默认入口函数def 
process(self, response) 115 | 116 | 117 | ## **构建pipeline** 118 | 该pipeline获取数据后将数据转为json格式,并输出到屏幕 119 | ```python 120 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 121 | 122 | class ConsolePipeline(ItemPipeline): 123 | def process_item(self, item): 124 | print json.dumps(item).decode("unicode-escape") 125 | ``` 126 | ## **构建spider(爬虫对象)** 127 | * 通过注入 *processor* 生成spider对象 128 | ```python 129 | from sasila.system_normal.spider.spider_core import SpiderCore 130 | 131 | spider = SpiderCore(Mzi_Processor()) 132 | ``` 133 | * RequestSpider对象包含批下载数量 *batch_size*,下载间隔 *time_sleep*,使用代理 *use_proxy* 等一切必要的属性 134 | ```python 135 | SpiderCore(processor=None, downloader=None, use_proxy=False,scheduler=None,batch_size=None,time_sleep=None) 136 | ``` 137 | * 本项目集成使用代理IP的功能,只要在构建RequestSpider时将 *use_proxy* 设置为 *True*,并在脚本同级目录下放置proxy.txt文件即可。你也可以在settings.py文件中写入代理IP文件路径。 138 | ```python 139 | PROXY_PATH_REQUEST = 'proxy/path' 140 | ``` 141 | * proxy.txt文件中请写入代理IP,格式为:IP,端口号。若该代理IP有账号密码,在末尾追加账号密码即可。 142 | ```text 143 | 127.0.0.1,8080 144 | 127.0.0.2,8080,user,pwd 145 | 127.0.0.3,8080,user,pwd 146 | ``` 147 | * RequestSpider已经默认设置好了 *downloader* 和 *scheduler*,如果不满意,可以自己进行定制。 148 | * 可以为spider设置 *downloader* 和 *pipeline* 甚至 *scheduler* 149 | ```python 150 | spider = spider.set_pipeline(ConsolePipeline()) 151 | ``` 152 | * 可以通过该方式启动爬虫 153 | ```python 154 | spider.start() 155 | ``` 156 | * 也可以将spider注入*manager*进行管理 157 | ```python 158 | from sasila.system_normal.manager import manager 159 | from sasila import system_web 160 | 161 | manager.set_spider(spider) 162 | 163 | system_web.start() 164 | ``` 165 | 166 | 访问 http://127.0.0.1:5000/slow_spider/start?spider_id=mzi_spider 来启动爬虫。 167 | 168 | 访问 http://127.0.0.1:5000/slow_spider/stop?spider_id=mzi_spider 来停止爬虫。 169 | 170 | 访问 http://127.0.0.1:5000/slow_spider/detail?spider_id=mzi_spider 来查看爬虫详细信息。 171 | 172 | ## **针对需要登录才能爬取的处理办法** 173 | * 可以为downloader加载登录器(loginer),在使用downloader的时候使用loginer进行登录获取cookies,再进行爬取 174 | * 也可以自己定义一个cookie池,批量进行登录并将登录成功的cookies放进cookie池中随时进行取用。项目中暂时没有这些功能。欢迎pull request~ 175 | 176 | ## **架构** 177 | ![jichu](https://github.com/DarkSand/Sasila/blob/master/pic/jichu.png) 178 | 179 | * 任务由 scheduler 发起调度,downloader 抓取网页内容, processor 执行预先编写的py脚本,输出结果或产生新的提链任务(发往 scheduler),形成闭环。 180 | * 每个脚本被认为是一个spider,spiderid确定一个任务。 181 | * downloader 182 | 1. method, header, cookie, proxy,timeout 等等抓取调度控制。 183 | 2. 可以通过适配类似 phantomjs 的webkit引擎支持渲染。 184 | * processor 185 | 1. 灵活运用pyquery,beautifulsoup等解析页面。 186 | 2. 在脚本中完全控制调度抓取的各项参数。 187 | 3. 可以向后链传递信息。 188 | 4. 异常捕获。 189 | * scheduler 190 | 1. 任务优先级。 191 | 2. 对任务进行监控。 192 | 3. 对任务进行去重等操作。 193 | 4. 支持增量。 194 | * webApi 195 | 1. 
对爬虫进行增删改查等操作。 196 | * 非及时爬虫流程图 197 | 198 | ![feijishi](https://github.com/DarkSand/Sasila/blob/master/pic/feijishi.png) 199 | 200 | ## **即时爬虫** 201 | 即时爬虫是可以通过api调用,传入需要爬取的页面或者需求,即时爬取数据并返回结果。现阶段开发并不完善。仅提供思路参考。示例核心代码在 *sasila.system_instant* 中。 202 | 203 | * 即时爬虫-获取数据流程图 204 | 205 | ![huoqushuju](https://github.com/DarkSand/Sasila/blob/master/pic/jishi-huoqushuju.png) 206 | 207 | * 即时爬虫-授权流程图 208 | 209 | ![shouquan](https://github.com/DarkSand/Sasila/blob/master/pic/jishi-shouquan.png) 210 | 211 | ## **为啥叫Sasila?** 212 | 213 | ![spider](https://github.com/DarkSand/Sasila/blob/master/pic/spider.jpg) 214 | 215 | 作为一个wower,你可以猜到吗ヾ( ̄▽ ̄) 216 | 217 | ## **fetchman** 218 | 219 | 现提供更好用的爬虫框架[**fetchman**](https://github.com/DarkSand/fetchman),在sasila的基础上做了更多优化和修复并移除web相关功能(个人感觉有点鸡肋)。 220 | 221 | 222 | [Build Status]: https://img.shields.io/travis/DarkSand/Sasila.svg?branch=master&style=flat 223 | [Travis CI]: https://travis-ci.org/DarkSand/Sasila 224 | [Coverage Status]: https://img.shields.io/coveralls/DarkSand/Sasila.svg?branch=master&style=flat 225 | [Coverage]: https://coveralls.io/github/DarkSand/Sasila 226 | [PyPI Version]: https://img.shields.io/pypi/v/Sasila.svg 227 | [PyPI]: https://pypi.python.org/pypi/sasila 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /pic/feijishi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/feijishi.png -------------------------------------------------------------------------------- /pic/jiagou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jiagou.png -------------------------------------------------------------------------------- /pic/jichu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jichu.png -------------------------------------------------------------------------------- /pic/jigou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jigou.png -------------------------------------------------------------------------------- /pic/jishi-huoqushuju.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-huoqushuju.png -------------------------------------------------------------------------------- /pic/jishi-shouquan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-shouquan.png -------------------------------------------------------------------------------- /pic/spider.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/spider.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.11.1 2 | redis==2.10.5 3 | requests==2.13.0 4 | six==1.10.0 5 | 
SQLAlchemy==1.1.4 6 | grequests==0.3.0 7 | selenium==2.53.6 8 | lxml==3.7.2 9 | beautifulsoup4==4.6.0 10 | -------------------------------------------------------------------------------- /sasila-example/car_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from sasila.system_normal.spider.spider_core import SpiderCore 4 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 5 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar 6 | from sasila.system_normal.processor.base_processor import BaseProcessor 7 | from sasila.system_normal.downloader.http.spider_request import Request 8 | from sasila.system_normal.utils.decorator import checkResponse 9 | 10 | from bs4 import BeautifulSoup as bs 11 | import json 12 | import time 13 | import sys 14 | 15 | if sys.version_info < (3, 0): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | 19 | 20 | class Car_Processor(BaseProcessor): 21 | spider_id = 'car_spider' 22 | spider_name = 'car_spider' 23 | allowed_domains = ['che168.com'] 24 | start_requests = [Request(url='http://www.che168.com', priority=0)] 25 | 26 | @checkResponse 27 | def process(self, response): 28 | soup = bs(response.m_response.content, 'lxml') 29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear') 30 | for province_div in province_div_list: 31 | province_name = province_div.select('span.capital a')[0].text 32 | city_list = province_div.select('div.city a') 33 | for city in city_list: 34 | city_name = city.text 35 | pinyin = city['href'].strip('/').split('/')[0] 36 | request = Request( 37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin, 38 | priority=1, callback=self.process_page_1) 39 | request.meta['province'] = province_name 40 | request.meta['city'] = city_name 41 | yield request 42 | 43 | @checkResponse 44 | def process_page_1(self, response): 45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312'))) 46 | for brand in brand_list: 47 | brand_dict = dict(brand) 48 | brand_name = brand_dict['name'] 49 | url = response.nice_join(brand_dict['url']) + '/' 50 | request = Request(url=url, priority=2, callback=self.process_page_2) 51 | request.meta['province'] = response.request.meta['province'] 52 | request.meta['city'] = response.request.meta['city'] 53 | request.meta['brand'] = brand_name 54 | yield request 55 | 56 | @checkResponse 57 | def process_page_2(self, response): 58 | soup = bs(response.m_response.content, 'lxml') 59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a') 60 | for cars_line in cars_line_list: 61 | cars_line_name = cars_line.text 62 | url = 'http://www.che168.com' + cars_line['href'] 63 | request = Request(url=url, priority=3, callback=self.process_page_3) 64 | request.meta['province'] = response.request.meta['province'] 65 | request.meta['city'] = response.request.meta['city'] 66 | request.meta['brand'] = response.request.meta['brand'] 67 | request.meta['cars_line'] = cars_line_name 68 | yield request 69 | 70 | @checkResponse 71 | def process_page_3(self, response): 72 | soup = bs(response.m_response.content, 'lxml') 73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo') 74 | for car_info in car_info_list: 75 | url = 'http://www.che168.com' + car_info['href'] 76 | request = Request(url=url, priority=4, callback=self.process_page_4) 77 | request.meta['province'] = 
response.request.meta['province'] 78 | request.meta['city'] = response.request.meta['city'] 79 | request.meta['brand'] = response.request.meta['brand'] 80 | request.meta['cars_line'] = response.request.meta['cars_line'] 81 | yield request 82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text) 83 | if next_page: 84 | url = 'http://www.che168.com' + next_page['href'] 85 | request = Request(url=url, priority=3, callback=self.process_page_3) 86 | request.meta['province'] = response.request.meta['province'] 87 | request.meta['city'] = response.request.meta['city'] 88 | request.meta['brand'] = response.request.meta['brand'] 89 | request.meta['cars_line'] = response.request.meta['cars_line'] 90 | yield request 91 | 92 | @checkResponse 93 | def process_page_4(self, response): 94 | soup = bs(response.m_response.content.decode('gb2312', 'ignore'), 'lxml') 95 | # Object moved 96 | #
Object moved to here.
97 | # 98 | if len(soup.select('div.car-title h2')) != 0: 99 | car = soup.select('div.car-title h2')[0].text 100 | detail_list = soup.select('div.details li') 101 | if len(detail_list) == 0: 102 | soup = bs(response.m_response.content, 'html5lib') 103 | detail_list = soup.select('div.details li') 104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '') 105 | first_borad_date = detail_list[1].select('span')[0].text 106 | gear = detail_list[2].select('span')[0].text.split('/')[0] 107 | displacement = detail_list[2].select('span')[0].text.split('/')[1] 108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '') 109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 110 | 111 | item = dict() 112 | item['car'] = car 113 | item['mileage'] = mileage 114 | item['first_borad_date'] = first_borad_date 115 | item['gear'] = gear 116 | item['displacement'] = displacement 117 | item['price'] = price 118 | item['crawl_date'] = crawl_date 119 | 120 | item['province'] = response.request.meta['province'] 121 | item['city'] = response.request.meta['city'] 122 | item['brand'] = response.request.meta['brand'] 123 | item['cars_line'] = response.request.meta['cars_line'] 124 | yield item 125 | 126 | 127 | if __name__ == '__main__': 128 | SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineCar()).start() 129 | -------------------------------------------------------------------------------- /sasila-example/fang_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang 9 | 10 | from sasila.system_normal.processor.base_processor import BaseProcessor 11 | from sasila.system_normal.downloader.http.spider_request import Request 12 | import time 13 | from sasila.system_normal.utils.decorator import checkResponse 14 | 15 | if sys.version_info < (3, 0): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | 19 | 20 | class Fang_Processor(BaseProcessor): 21 | spider_id = 'fang_spider' 22 | spider_name = 'fang_spider' 23 | allowed_domains = ['fang.com'] 24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)] 25 | 26 | @checkResponse 27 | def process(self, response): 28 | soup = bs(response.m_response.content, 'lxml') 29 | province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'} 30 | province_div_list = soup.select('div#c02 ul li') 31 | for province_div in province_div_list: 32 | province_name = province_div.select('strong')[0].text 33 | if province_name != '其他': 34 | if province_name in province_list: 35 | city_list = province_div.select('a') 36 | for city in city_list: 37 | city_name = city.text 38 | url = city['href'] 39 | request = Request(url=url, priority=1, callback=self.process_page_1) 40 | request.meta['province'] = province_name 41 | request.meta['city'] = city_name 42 | yield request 43 | 44 | @checkResponse 45 | def process_page_1(self, response): 46 | soup = bs(response.m_response.content, 'lxml') 47 | district_list = soup.select('div.qxName a') 48 | district_list.pop(0) 49 | for district in district_list: 50 | district_name = district.text 51 | url = response.request.url + district['href'] 52 | request = 
Request(url=url, priority=2, callback=self.process_page_2) 53 | request.meta['province'] = response.request.meta['province'] 54 | request.meta['city'] = response.request.meta['city'] 55 | request.meta['district'] = district_name 56 | yield request 57 | 58 | @checkResponse 59 | def process_page_2(self, response): 60 | soup = bs(response.m_response.content, 'lxml') 61 | avg_price_list = soup.select('div.newcardR dl') 62 | if len(avg_price_list) > 0: 63 | avg_price = avg_price_list[1].select('dd b')[0].text 64 | else: 65 | avg_price = '未知' 66 | detail_list = soup.select('div.houseList dl') 67 | for detail in detail_list: 68 | if len(detail.select('p.mt10 a span')) != 0: 69 | estate = detail.select('p.mt10 a span')[0].text 70 | area = detail.select('div.area p')[0].text.replace('㎡', '') 71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip() 72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text 73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 74 | item = dict() 75 | item['avg_price'] = avg_price 76 | item['estate'] = estate 77 | item['area'] = area 78 | item['layout'] = layout 79 | item['total_price'] = total_price 80 | item['crawl_date'] = crawl_date 81 | 82 | item['province'] = response.request.meta['province'] 83 | item['city'] = response.request.meta['city'] 84 | item['district'] = response.request.meta['district'] 85 | item['url'] = response.request.url 86 | yield item 87 | 88 | next_page = soup.select('a#PageControl1_hlk_next') 89 | if len(next_page) > 0: 90 | url = response.nice_join(next_page[0]['href']) 91 | request = Request(url=url, priority=2, callback=self.process_page_2) 92 | request.meta['province'] = response.request.meta['province'] 93 | request.meta['city'] = response.request.meta['city'] 94 | request.meta['district'] = response.request.meta['district'] 95 | yield request 96 | 97 | 98 | if __name__ == '__main__': 99 | spider = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineFang()).start() 100 | -------------------------------------------------------------------------------- /sasila-example/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import os 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from car_processor import Car_Processor 9 | from fang_processor import Fang_Processor 10 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 11 | from sasila.system_normal.spider.spider_core import SpiderCore 12 | from sasila.system_normal.manager import manager 13 | from sasila import system_web 14 | 15 | if __name__ == '__main__': 16 | spider_car = SpiderCore(Car_Processor(),batch_size=100).set_pipeline(ConsolePipeline()) 17 | spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline()) 18 | manager.set_spider(spider_car) 19 | manager.set_spider(spider_fang) 20 | system_web.start() 21 | -------------------------------------------------------------------------------- /sasila-example/proxy.txt: -------------------------------------------------------------------------------- 1 | 127.0.0.1,8888 -------------------------------------------------------------------------------- /sasila-example/settings.py: -------------------------------------------------------------------------------- 1 | # settings 2 | 3 | # phantomjs'useragent 4 | # USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 
Safari/537.36' 5 | 6 | # phantomjs'path 7 | # PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe' 8 | 9 | # phantomjs'service 10 | # PHANTOMJS_SERVICE = [ 11 | # '--proxy=localhost:8888', 12 | # '--proxy-type=http', 13 | # # '--proxy-auth=username:password' 14 | # ] 15 | 16 | # phatomjs'pool size 17 | # DRIVER_POOL_SIZE = 5 18 | 19 | # proxy'path 20 | # PROXY_PATH_REQUEST = 'proxy/path' 21 | 22 | # redis host 23 | # REDIS_HOST = 'localhost' 24 | 25 | # redis port 26 | REDIS_PORT = 6379 27 | -------------------------------------------------------------------------------- /sasila/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sasila.settings 4 | 5 | __version__ = '0.0.26' 6 | -------------------------------------------------------------------------------- /sasila/settings/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import imp 4 | import sys 5 | import os 6 | 7 | if sys.version_info < (3, 0): 8 | reload(sys) 9 | sys.setdefaultencoding('utf-8') 10 | 11 | import sasila.settings.default_settings 12 | 13 | setting_path = os.path.join(os.getcwd(), 'settings.py') 14 | 15 | # 如果运行目录存在settings.py文件,则对默认设置进行覆写 16 | if os.path.exists(setting_path): 17 | new_settings = imp.load_source('settings', setting_path) 18 | 19 | new_settings_dict = dict() 20 | for key in dir(new_settings): 21 | if key.isupper(): 22 | new_settings_dict[key] = getattr(new_settings, key) 23 | if sys.version_info < (3, 0): 24 | for key, value in new_settings_dict.iteritems(): 25 | setattr(default_settings, key, value) 26 | else: 27 | for key, value in new_settings_dict.items(): 28 | setattr(default_settings, key, value) -------------------------------------------------------------------------------- /sasila/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import os 5 | 6 | if sys.version_info < (3, 0): 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | BASE_DIR = os.getcwd() 11 | 12 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' 13 | 14 | PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe' 15 | 16 | # PHANTOMJS_SERVICE = [ 17 | # '--proxy=localhost:8888', 18 | # '--proxy-type=http', 19 | # # '--proxy-auth=username:password' 20 | # ] 21 | 22 | PHANTOMJS_SERVICE = None 23 | 24 | DRIVER_POOL_SIZE = 5 25 | 26 | PROXY_PATH_REQUEST = os.path.join(BASE_DIR, 'proxy.txt') 27 | 28 | REDIS_HOST = 'localhost' 29 | 30 | REDIS_PORT = 6379 31 | 32 | -------------------------------------------------------------------------------- /sasila/system_instant/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_instant/blueprints/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 
-------------------------------------------------------------------------------- /sasila/system_instant/blueprints/jd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import json 5 | from flask import Blueprint 6 | from flask import request 7 | from sasila.system_instant.manager.jd_manager import JdManager 8 | 9 | if sys.version_info < (3, 0): 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | im_jd = Blueprint('im_jd', __name__) 14 | 15 | jd_manager = JdManager() 16 | 17 | 18 | @im_jd.route('/login') 19 | def login(): 20 | return jd_manager.login(request.args['collect_token'], request.args['account'], request.args['password']) 21 | 22 | 23 | @im_jd.route('/qrlogin') 24 | def qr_login(): 25 | message = jd_manager.qrlogin(request.args['collect_token']) 26 | # result = '
' + message 27 | # return result 28 | return message 29 | 30 | 31 | @im_jd.route('/submit_qrlogin') 32 | def submit_qrlogin(): 33 | return jd_manager.submit_qrlogin(request.args['collect_token']) 34 | -------------------------------------------------------------------------------- /sasila/system_instant/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_instant/crawler/jd/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_instant/crawler/jd/request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import sys 5 | import time 6 | import requests 7 | from bs4 import BeautifulSoup as bs 8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool 9 | from sasila.system_normal.utils.cookie import formart_selenium_cookies 10 | from sasila.system_normal.utils import logger 11 | from sasila.system_normal.utils import jd_code 12 | 13 | if sys.version_info < (3, 0): 14 | reload(sys) 15 | sys.setdefaultencoding('utf-8') 16 | 17 | 18 | def abstract(text, start, end): 19 | if text is None or text == '': 20 | return '' 21 | res = '' 22 | if start is not None and start != '': 23 | if start not in text: 24 | return res 25 | else: 26 | text = text[text.index(start) + len(start):] 27 | if end is not None and end != '': 28 | if end not in text: 29 | return res 30 | else: 31 | res = text[0:text.index(end)] 32 | else: 33 | res = text 34 | return res 35 | 36 | 37 | class JdMessage(object): 38 | def __init__(self): 39 | self.code = "" 40 | self.code_description = "" 41 | self.cookies = "" 42 | self.qr_captcha = "" 43 | 44 | 45 | class JdRequest(object): 46 | def __init__(self): 47 | self.web_driver_pool = None # type: Queue 48 | 49 | def init_pool(self): 50 | logger.info('init web driver pool...') 51 | self.web_driver_pool = get_web_driver_pool(1) 52 | logger.info('init web driver pool success...') 53 | 54 | def login(self, account, password): 55 | message = JdMessage() 56 | 57 | web = self.web_driver_pool.get() # type: webdriver.PhantomJS 58 | web.delete_all_cookies() 59 | 60 | web.get("https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F") 61 | element = web.find_element_by_css_selector("div.login-tab.login-tab-r").find_element_by_css_selector("a") 62 | element.click() 63 | element = web.find_element_by_id("loginname") 64 | element.clear() 65 | element.send_keys(account) 66 | element = web.find_element_by_id("nloginpwd") 67 | element.clear() 68 | element.send_keys(password) 69 | element = web.find_element_by_css_selector("a#loginsubmit") 70 | element.click() 71 | time.sleep(3) 72 | 73 | if '我的京东' in bs(web.execute_script("return document.documentElement.outerHTML"), 'lxml').title.string: 74 | message.code = jd_code.SUCCESS 75 | message.code_description = "登录成功" 76 | message.cookies = formart_selenium_cookies(web.get_cookies()) 77 | else: 78 | # 需要手机验证码等等状况 79 | 
pass 80 | 81 | self.web_driver_pool.put(web) 82 | return message 83 | 84 | def qr_login(self): 85 | message = JdMessage() 86 | headers = dict() 87 | headers[ 88 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 89 | headers["Accept"] = "*/*" 90 | headers["Accept-Encoding"] = "gzip, deflate" 91 | headers["Accept-Language"] = "zh-CN,en,*" 92 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F" 93 | session = requests.Session() 94 | response = session.get("https://qr.m.jd.com/show?appid=133&size=147&t=" + str(time.time())) 95 | 96 | message.code = jd_code.SUCCESS 97 | message.qr_captcha = response.content.encode("base64") 98 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape") 99 | return message 100 | 101 | def submit_qrlogin(self, cookies): 102 | message = JdMessage() 103 | 104 | headers = dict() 105 | headers[ 106 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 107 | headers["Accept"] = "*/*" 108 | headers["Accept-Encoding"] = "gzip, deflate" 109 | headers["Accept-Language"] = "zh-CN,en,*" 110 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F" 111 | session = requests.Session() 112 | 113 | response = session.get("https://qr.m.jd.com/check?callback=jQuery6172296&appid=133&_=1486609849337", 114 | cookies=json.loads(cookies), 115 | headers=headers) 116 | 117 | ticket = abstract(response.content, '\"ticket\" : \"', '\"') 118 | 119 | headers['X-Requested-With'] = 'XMLHttpRequest' 120 | response = session.get("https://passport.jd.com/uc/qrCodeTicketValidation?t=" + ticket, headers=headers) 121 | 122 | message.code = jd_code.SUCCESS 123 | message.code_description = "登录成功" 124 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape") 125 | 126 | return message 127 | -------------------------------------------------------------------------------- /sasila/system_instant/database/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_instant/database/jd_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from sqlalchemy import Column, Integer, String, DateTime, create_engine 5 | from sqlalchemy.orm import sessionmaker 6 | from sqlalchemy.ext.declarative import declarative_base 7 | 8 | if sys.version_info < (3, 0): 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | 12 | # 创建对象的基类: 13 | Base = declarative_base() 14 | 15 | 16 | class Process(Base): 17 | # 表的名字: 18 | __tablename__ = 'crawler_flow_info' 19 | # 表的结构: 20 | collect_token = Column(String(100), primary_key=True) 21 | customer_id = Column(String(100)) 22 | token_valid_time = Column(Integer) 23 | token_create_time = Column(Integer) 24 | status = Column(String(10)) 25 | cookies = Column(String(5000)) 26 | 27 | 28 | class JdDatabase(object): 29 | def __init__(self): 30 | # 初始化数据库连接: 31 | self.engine = create_engine('mysql+mysqlconnector://root:root@192.168.3.210:3306/hiveengine') 32 | # 创建DBSession类型: 33 | self.DBSession = 
sessionmaker(bind=self.engine) 34 | self._create_all() 35 | 36 | def _create_all(self): 37 | ''' 38 | 创建从Base派生的所有表,如果数据表存在则忽视 39 | :return: 40 | ''' 41 | Base.metadata.create_all(self.engine) 42 | 43 | def _drop_all(self): 44 | ''' 45 | 删除DB中所有的表 46 | :return: 47 | ''' 48 | Base.metadata.drop_all(self.engine) 49 | 50 | def create_session(self): 51 | return self.DBSession() 52 | 53 | def query_cookie(self, collect_token): 54 | session = self.DBSession() 55 | cookies = session.query(Process).filter(Process.collect_token == collect_token).first().cookies 56 | session.close() 57 | return cookies 58 | 59 | def update_cookie(self, collect_token, cookies): 60 | session = self.DBSession() 61 | session.query(Process).filter(Process.collect_token == collect_token).update({ 62 | Process.cookies: cookies 63 | }) 64 | session.close() 65 | -------------------------------------------------------------------------------- /sasila/system_instant/manager/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_instant/manager/jd_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import datetime 4 | from sasila.system_normal.utils import jd_code 5 | import json 6 | from sasila.system_instant.crawler.jd.request import JdRequest 7 | from sasila.system_instant.database.jd_database import * 8 | 9 | if sys.version_info < (3, 0): 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | 14 | class JdResponse(object): 15 | def __init__(self, code, code_description, qr_captcha=None): 16 | self.code = code 17 | self.code_description = code_description 18 | self.qr_captcha = qr_captcha 19 | 20 | 21 | class JdManager(object): 22 | def __init__(self): 23 | self.database = JdDatabase() 24 | self.request = JdRequest() 25 | self.request.init_pool() 26 | 27 | def login(self, collect_token, account, password): 28 | message = self.request.login(account, password) 29 | if message.code == jd_code.SUCCESS: 30 | self.database.update_cookie(collect_token, message.cookies) 31 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode( 32 | 'unicode-escape') 33 | 34 | def qrlogin(self, collect_token): 35 | message = self.request.qr_login() 36 | if message.code == jd_code.SUCCESS: 37 | self.database.update_cookie(collect_token, message.cookies) 38 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description, 39 | qr_captcha=message.qr_captcha).__dict__).decode( 40 | 'unicode-escape') 41 | 42 | def submit_qrlogin(self, collect_token): 43 | cookies = self.database.query_cookie(collect_token) 44 | message = self.request.submit_qrlogin(cookies) 45 | if message.code == jd_code.SUCCESS: 46 | self.database.update_cookie(collect_token, message.cookies) 47 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode( 48 | 'unicode-escape') 49 | -------------------------------------------------------------------------------- /sasila/system_instant/settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if 
sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/blueprints/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/blueprints/slow_spiders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from flask import Blueprint, request 5 | from sasila.system_normal.manager import manager 6 | import json 7 | 8 | if sys.version_info < (3, 0): 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | 12 | slow_spider = Blueprint('slow_spider', __name__) 13 | 14 | 15 | @slow_spider.route('/all') 16 | def get_all_spider(): 17 | return json.dumps(manager.get_all_spider()) 18 | 19 | 20 | @slow_spider.route('/find') 21 | def find_spider(spider_id): 22 | return json.dumps(manager.find_spider(spider_id)) 23 | 24 | 25 | @slow_spider.route('/start') 26 | def start_spider(): 27 | spider_id = request.args['spider_id'] 28 | manager.start_spider(spider_id) 29 | return 'start success:' + spider_id 30 | 31 | 32 | @slow_spider.route('/restart') 33 | def restart_spider(): 34 | spider_id = request.args['spider_id'] 35 | manager.stop_spider(spider_id) 36 | manager.restart_spider(spider_id) 37 | return 'restart success:' + spider_id 38 | 39 | 40 | @slow_spider.route('/stop') 41 | def stop_spider(): 42 | spider_id = request.args['spider_id'] 43 | manager.stop_spider(request.args['spider_id']) 44 | return 'stop success:' + spider_id 45 | 46 | 47 | @slow_spider.route('/detail') 48 | def get_spider_detail(): 49 | return manager.get_spider_detail(request.args['spider_id']) 50 | 51 | 52 | @slow_spider.route('/init') 53 | def init_system(): 54 | return json.dumps(manager.init_system()) 55 | -------------------------------------------------------------------------------- /sasila/system_normal/database/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/downloader/base_downloder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | 
sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class BaseDownLoader(object): 11 | def __init__(self): 12 | self.loginer = None 13 | 14 | def download(self, request): 15 | pass 16 | 17 | def set_loginer(self, loginer): 18 | self.loginer = loginer 19 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/http/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/downloader/http/selenium_response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import re 5 | from posixpath import normpath 6 | 7 | if sys.version_info < (3, 0): 8 | reload(sys) 9 | sys.setdefaultencoding('utf-8') 10 | from urlparse import urljoin, urlparse, urlunparse 11 | else: 12 | from urllib.parse import urljoin, urlparse, urlunparse 13 | 14 | 15 | class SeleniumResponse(object): 16 | def __init__(self, m_response=None, request=None): 17 | self.request = request 18 | self.m_response = m_response 19 | 20 | def __str__(self): 21 | if self.m_response: 22 | return "" % (self.request.url, (float(len(self.m_response.content)) / 1000)) 23 | else: 24 | return "" % self.request.url 25 | 26 | def nice_join(self, url): 27 | url1 = urljoin(self.request.url, url) 28 | arr = urlparse(url1) 29 | path = normpath(arr[2]) 30 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment)) 31 | 32 | def is_url(self, url): 33 | if re.match(r'^https?:/{2}\w.+$', url): 34 | return True 35 | else: 36 | return False 37 | 38 | __repr__ = __str__ 39 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/http/spider_request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class Request(object): 11 | def __init__(self, url=None, data=None, json=None, headers=None, method="GET", cookies=None, meta=None, 12 | callback=None, 13 | errback=None, priority=0, allow_redirects=True, timeout=5, duplicate_remove=True): 14 | self.url = url 15 | self.data = data 16 | self.json = json 17 | self.headers = headers 18 | self.method = method 19 | self.allow_redirects = allow_redirects 20 | if not meta: 21 | self.meta = {} 22 | else: 23 | self.meta = meta 24 | self.cookies = cookies 25 | self.callback = callback 26 | self.priority = priority 27 | self.duplicate_remove = duplicate_remove 28 | self.timeout = timeout 29 | self.errback = errback 30 | 31 | def __str__(self): 32 | return "" % (self.method, self.url) 33 | 34 | __repr__ = __str__ 35 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/http/spider_response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import re 5 | from posixpath import normpath 6 | 7 | from requests.models import Response as Response_name 8 | 9 | if sys.version_info < (3, 0): 10 | reload(sys) 11 | 
sys.setdefaultencoding('utf-8') 12 | from urlparse import urljoin, urlparse, urlunparse 13 | else: 14 | from urllib.parse import urljoin, urlparse, urlunparse 15 | 16 | 17 | class Response(object): 18 | def __init__(self, m_response=None, request=None): 19 | self.request = request 20 | self.m_response = m_response 21 | 22 | def __str__(self): 23 | if isinstance(self.m_response, Response_name): 24 | if self.m_response: 25 | return "" % ( 26 | self.m_response.status_code, self.m_response.url, (float(len(self.m_response.content)) / 1000)) 27 | else: 28 | return "" % self.request.url 29 | else: 30 | return "" % self.request.url 31 | 32 | def nice_join(self, url): 33 | url1 = urljoin(self.request.url, url) 34 | arr = urlparse(url1) 35 | path = normpath(arr[2]) 36 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment)) 37 | 38 | def is_url(self, url): 39 | if re.match(r'^https?:/{2}\w.+$', url): 40 | return True 41 | else: 42 | return False 43 | 44 | __repr__ = __str__ 45 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/proxy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/proxy/proxy_pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from sasila.settings.default_settings import PROXY_PATH_REQUEST 6 | 7 | if sys.version_info < (3, 0): 8 | import Queue 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | else: 12 | from queue import Queue 13 | 14 | 15 | class ProxyPool(object): 16 | def __init__(self): 17 | self.queue = Queue.Queue() 18 | with open(PROXY_PATH_REQUEST, 'r') as f: 19 | lines = f.readlines() 20 | self.len = len(lines) 21 | for line in lines: 22 | info = line.strip().split(',') 23 | proxy = {} 24 | if len(info) == 2: 25 | proxy = {"http": "http://%s:%s" % (info[0], info[1]), 26 | "https": "http://%s:%s" % (info[0], info[1])} 27 | elif len(info) == 4: 28 | proxy = {"http": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1]), 29 | "https": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1])} 30 | self.queue.put(proxy) 31 | 32 | def __len__(self): 33 | return self.len 34 | 35 | def getProxy(self): 36 | proxy = self.queue.get() 37 | self.queue.put(proxy) 38 | return proxy 39 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/requests_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import grequests 5 | import requests 6 | from requests.adapters import HTTPAdapter 7 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader 8 | from sasila.system_normal.downloader.http.spider_response import Response 9 | from sasila.system_normal.downloader.proxy.proxy_pool import ProxyPool 10 | 11 | from sasila.system_normal.utils import logger 12 | 13 | if sys.version_info < (3, 0): 14 | reload(sys) 15 | sys.setdefaultencoding('utf-8') 16 | 17 | 18 | class RequestsDownLoader(BaseDownLoader): 19 | # proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",} 20 | 21 | def __init__(self, loginer=None, use_proxy=False): 22 | self.loginer = 
loginer 23 | self.use_proxy = use_proxy 24 | if use_proxy: 25 | self.proxy_pool = ProxyPool() 26 | if len(self.proxy_pool) == 0: 27 | self.use_proxy = False 28 | self._cookies = None 29 | 30 | self._headers = dict() 31 | self._headers[ 32 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 33 | self._headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" 34 | self._headers["Accept-Encoding"] = "gzip, deflate, sdch" 35 | self._headers["Accept-Language"] = "zh-CN,zh;q=0.8" 36 | self._request_retry = HTTPAdapter(max_retries=3) 37 | 38 | cookie_dict = dict() 39 | self._cookies = cookie_dict 40 | 41 | def init_loginer(self, account, password): 42 | self._cookies = self.loginer.logint(account, password) 43 | 44 | def download(self, batch): 45 | batch_requests = [] 46 | 47 | for request in batch: 48 | session = requests.session() 49 | session.mount('https://', self._request_retry) 50 | session.mount('http://', self._request_retry) 51 | 52 | if not request.headers: 53 | request.headers = self._headers 54 | session.headers = self._headers 55 | 56 | if request.method.upper() == "GET": 57 | if self.use_proxy: 58 | m_proxies = self.proxy_pool.getProxy() 59 | batch_requests.append(grequests.get( 60 | session=session, 61 | url=request.url, 62 | headers=request.headers, 63 | cookies=self._cookies, 64 | verify=False, 65 | allow_redirects=request.allow_redirects, 66 | timeout=request.timeout, 67 | proxies=m_proxies 68 | )) 69 | else: 70 | batch_requests.append(grequests.get( 71 | session=session, 72 | url=request.url, 73 | headers=request.headers, 74 | cookies=self._cookies, 75 | verify=False, 76 | allow_redirects=request.allow_redirects, 77 | timeout=request.timeout 78 | )) 79 | elif request.method.upper() == "POST": 80 | if self.use_proxy: 81 | m_proxies = self.proxy_pool.getProxy() 82 | batch_requests.append(grequests.post( 83 | session=session, 84 | url=request.url, 85 | data=request.data, 86 | json=request.json, 87 | headers=request.headers, 88 | cookies=self._cookies, 89 | verify=False, 90 | allow_redirects=request.allow_redirects, 91 | timeout=request.timeout, 92 | proxies=m_proxies 93 | )) 94 | else: 95 | batch_requests.append(grequests.post( 96 | session=session, 97 | url=request.url, 98 | data=request.data, 99 | json=request.json, 100 | headers=request.headers, 101 | cookies=self._cookies, 102 | verify=False, 103 | allow_redirects=request.allow_redirects, 104 | timeout=request.timeout 105 | )) 106 | else: 107 | pass 108 | 109 | rets = grequests.map(batch_requests, exception_handler=exception_handler) 110 | 111 | true_responses = [] 112 | index = 0 113 | for ret in rets: 114 | true_response = Response( 115 | m_response=ret, 116 | request=batch[index], 117 | ) 118 | true_responses.append(true_response) 119 | logger.info(true_response) 120 | index += 1 121 | 122 | return true_responses 123 | 124 | 125 | def exception_handler(request, exception): 126 | logger.error("%s %s" % (request.url, exception)) 127 | 128 | 129 | if __name__ == "__main__": 130 | proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",} 131 | requests.post(url="http://www.jd.com", data={"123": "fdsgs"}) 132 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/selenium_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 
4 | 5 | from sasila.settings import default_settings 6 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader 7 | from sasila.system_normal.downloader.http.selenium_response import SeleniumResponse 8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool 9 | from sasila.system_normal.utils import logger 10 | from multiprocessing.pool import ThreadPool as Pool 11 | 12 | if sys.version_info < (3, 0): 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | 17 | class SeleniumDownLoader(BaseDownLoader): 18 | def __init__(self, driver_pool_size=None): 19 | self.driver_pool_size = driver_pool_size 20 | logger.info("init web driver pool...") 21 | if driver_pool_size: 22 | self.web_driver_pool = get_web_driver_pool(driver_pool_size) 23 | else: 24 | self.web_driver_pool = get_web_driver_pool(default_settings.DRIVER_POOL_SIZE) 25 | logger.info("init web driver pool success") 26 | 27 | def download_one(self, request): 28 | web = self.web_driver_pool.get() # type:WebDriver 29 | web.get(request.url) 30 | m_response = m_object() 31 | m_response.content = web.execute_script("return document.documentElement.outerHTML") 32 | response = SeleniumResponse(m_response=m_response, request=request) 33 | self.web_driver_pool.put(web) 34 | return response 35 | 36 | def download(self, batch): 37 | if self.driver_pool_size: 38 | pool = Pool(processes=self.driver_pool_size) 39 | else: 40 | pool = Pool(processes=default_settings.DRIVER_POOL_SIZE) 41 | 42 | results = [] 43 | 44 | for request in batch: 45 | results.append(pool.apply_async(self.download_one, (request,))) 46 | pool.close() 47 | pool.join() 48 | 49 | true_responses = [] 50 | for result in results: 51 | true_response = result.get() 52 | true_responses.append(true_response) 53 | logger.info(true_response) 54 | 55 | return true_responses 56 | 57 | 58 | class m_object(object): 59 | pass 60 | -------------------------------------------------------------------------------- /sasila/system_normal/downloader/web_driver_pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | 6 | from selenium import webdriver 7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 8 | 9 | from sasila.settings import default_settings 10 | 11 | if sys.version_info < (3, 0): 12 | import Queue 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | else: 16 | from queue import Queue 17 | 18 | dcap = dict(DesiredCapabilities.PHANTOMJS) 19 | dcap["phantomjs.page.settings.resourceTimeout"] = 10 20 | dcap["phantomjs.page.settings.loadImages"] = True 21 | dcap["phantomjs.page.settings.userAgent"] = default_settings.USER_AGENT 22 | 23 | 24 | def _get_base_driver(): 25 | if default_settings.PHANTOMJS_SERVICE: 26 | web = webdriver.PhantomJS(service_args=default_settings.PHANTOMJS_SERVICE, executable_path=default_settings.PHANTOMJS_PATH 27 | , desired_capabilities=dcap) 28 | else: 29 | web = webdriver.PhantomJS(executable_path=default_settings.PHANTOMJS_PATH 30 | , desired_capabilities=dcap) 31 | return web 32 | 33 | 34 | def get_web_driver_pool(num): 35 | driver_queue = Queue.Queue() 36 | i = 0 37 | while i < num: 38 | web = _get_base_driver() 39 | driver_queue.put(web) 40 | i += 1 41 | return driver_queue 42 | -------------------------------------------------------------------------------- /sasila/system_normal/loginer/__init__.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/loginer/base_loginer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class BaseLoginer(object): 11 | def login(self, account, password): 12 | cookies = "" 13 | return cookies 14 | -------------------------------------------------------------------------------- /sasila/system_normal/loginer/jd_loginer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/manager/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from sasila.system_normal.manager.spider_manager import SpiderManager 4 | import sys 5 | 6 | if sys.version_info < (3, 0): 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | manager = SpiderManager() 11 | -------------------------------------------------------------------------------- /sasila/system_normal/manager/spider_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import json 5 | import threading 6 | 7 | if sys.version_info < (3, 0): 8 | reload(sys) 9 | sys.setdefaultencoding('utf-8') 10 | 11 | 12 | class SpiderManager(object): 13 | def __init__(self): 14 | self.spider_list = dict() 15 | 16 | def set_spider(self, spider): 17 | self.spider_list[spider._spider_id] = spider 18 | 19 | def del_spider(self, spider_id): 20 | if spider_id in self.spider_list.keys(): 21 | self.spider_list[spider_id].stop() 22 | del self.spider_list[spider_id] 23 | 24 | def init_system(self): 25 | pass 26 | 27 | def get_all_spider(self): 28 | return json.dumps(self.spider_list.keys()) 29 | 30 | def find_spider(self, spider_id): 31 | pass 32 | 33 | def start_spider(self, spider_id): 34 | if self.spider_list[spider_id]._spider_status == "stopped": 35 | thread = threading.Thread(target=self.spider_list[spider_id].start) 36 | thread.setDaemon(True) 37 | thread.start() 38 | 39 | def restart_spider(self, spider_id): 40 | thread = threading.Thread(target=self.spider_list[spider_id].restart) 41 | thread.setDaemon(True) 42 | thread.start() 43 | 44 | def stop_spider(self, spider_id): 45 | self.spider_list[spider_id].stop() 46 | 47 | def get_spider_detail(self, spider_id): 48 | return str(self.spider_list[spider_id]._process_count) 49 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 
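A minimal usage sketch (illustrative, not a file in this repository) of how the pieces above fit together: a custom pipeline built on the ItemPipeline base class defined in base_pipeline.py below is attached to a SpiderCore and registered with the global SpiderManager, so the /all, /start and /stop blueprint routes can drive it. JsonLinesPipeline and items.jsonl are names invented for the example.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Illustrative sketch only; JsonLinesPipeline and items.jsonl are made-up names.
import codecs
import json

from sasila.system_normal.manager import manager
from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
from sasila.system_normal.processor.car_processor import Car_Processor
from sasila.system_normal.spider.spider_core import SpiderCore


class JsonLinesPipeline(ItemPipeline):
    def process_item(self, item):
        # append every crawled item as one UTF-8 JSON line
        with codecs.open('items.jsonl', 'a', 'utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    spider = SpiderCore(Car_Processor(), time_sleep=0.5).set_pipeline(JsonLinesPipeline())
    manager.set_spider(spider)   # now visible to the slow_spider blueprint routes
    spider.start()               # or start it later via the HTTP interface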
-------------------------------------------------------------------------------- /sasila/system_normal/pipeline/base_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class ItemPipeline(object): 11 | def process_item(self, item): 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/console_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 5 | import json 6 | 7 | if sys.version_info < (3, 0): 8 | reload(sys) 9 | sys.setdefaultencoding('utf-8') 10 | 11 | 12 | class ConsolePipeline(ItemPipeline): 13 | def process_item(self, item): 14 | if sys.version_info < (3, 0): 15 | print(json.dumps(item).decode("unicode-escape")) 16 | else: 17 | print(json.dumps(item).encode('utf8').decode("unicode-escape")) 18 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/kafa_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # import sys 4 | # from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 5 | # import json 6 | # from sasila.system_normal.utils.kafka_utils import send_message 7 | # 8 | # reload(sys) 9 | # sys.setdefaultencoding('utf-8') 10 | # 11 | # 12 | # class KafkaPipeline(ItemPipeline): 13 | # def process_item(self, item): 14 | # send_message("dataCollectionTopic", bytes("CompanyConsummer__" + json.dumps(item).decode("unicode-escape"))) 15 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/pic_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | import uuid 6 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 7 | 8 | if sys.version_info < (3, 0): 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | 12 | 13 | class PicPipeline(ItemPipeline): 14 | def process_item(self, item): 15 | if item is not None: 16 | if not os.path.exists("img"): 17 | os.mkdir("img") 18 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs: 19 | fs.write(item) 20 | print("download success!") 21 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/pipe_item.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | if sys.version_info < (3, 0): 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') 7 | 8 | 9 | class pipeItem(object): 10 | def __init__(self, pipenames=[], result=None): 11 | self.pipenames = pipenames 12 | self.result = result 13 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/test_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 5 
| 6 | if sys.version_info < (3, 0): 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | 11 | class TestPipeline(ItemPipeline): 12 | def __init__(self): 13 | self.result = {} 14 | 15 | def process_item(self, item): 16 | self.result = item 17 | -------------------------------------------------------------------------------- /sasila/system_normal/pipeline/text_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline 5 | from sasila.system_normal.utils import logger 6 | import traceback 7 | import codecs 8 | 9 | if sys.version_info < (3, 0): 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | 14 | class TextPipeline(ItemPipeline): 15 | def process_item(self, item): 16 | with open("result.txt", 'a') as f: 17 | f.write( 18 | item["province"] + ',' + 19 | item["city"] + ',' + 20 | item["company_name"] + ',' + 21 | item["company_man"] + ',' + 22 | item["company_telephone"] + ',' + 23 | item["company_address"] + ',' + 24 | item["company_registered_capital"] + ',' + 25 | item["company_registered_time"] + ',' + 26 | item["company_status"] + ',' + 27 | item["source"] + ',' + 28 | item["update_time"] + "\n" 29 | ) 30 | 31 | 32 | class TextPipelineCar(ItemPipeline): 33 | def process_item(self, item): 34 | try: 35 | with codecs.open("result.csv", 'a', 'gbk') as f: 36 | f.write( 37 | item["province"] + ',' + 38 | item["city"] + ',' + 39 | item["brand"].replace(u'\u30fb', '·') + ',' + 40 | item["cars_line"].replace(u'\u30fb', '·') + ',' + 41 | item["car"].replace(u'\u30fb', '·') + ',' + 42 | item["mileage"] + ',' + 43 | item["first_borad_date"] + ',' + 44 | item["gear"] + ',' + 45 | item["displacement"] + ',' + 46 | item["price"] + ',' + 47 | item["crawl_date"] + "\n" 48 | ) 49 | except: 50 | logger.error(traceback.format_exc()) 51 | 52 | 53 | class TextPipelineFang(ItemPipeline): 54 | def process_item(self, item): 55 | try: 56 | with codecs.open("fang.csv", 'a', 'gbk') as f: 57 | f.write( 58 | item["province"] + ',' + 59 | item["city"] + ',' + 60 | item["district"] + ',' + 61 | item["avg_price"] + ',' + 62 | item["estate"].replace(',', ',') + ',' + 63 | item["area"] + ',' + 64 | item["layout"] + ',' + 65 | item["total_price"] + ',' + 66 | item["crawl_date"] + ',' + 67 | item["url"] + "\n" 68 | ) 69 | except: 70 | logger.error(traceback.format_exc()) 71 | 72 | 73 | class TextPipelineFangShop(ItemPipeline): 74 | def process_item(self, item): 75 | try: 76 | with codecs.open("fang_shop.csv", 'a', 'gbk') as f: 77 | f.write( 78 | item["city"] + ',' + 79 | item["district"] + ',' + 80 | item["estate"].replace(',', ',') + ',' + 81 | item["floor"] + ',' + 82 | item["total_floor"] + ',' + 83 | item["type"] + ',' + 84 | item["area"] + ',' + 85 | item["total_price"] + ',' + 86 | item["crawl_date"] + ',' + 87 | item["url"] + "\n" 88 | ) 89 | except: 90 | logger.error(traceback.format_exc()) 91 | 92 | 93 | class TextPipelineBendibao(ItemPipeline): 94 | def process_item(self, item): 95 | try: 96 | with codecs.open("bendibao.csv", 'a', 'gbk') as f: 97 | f.write( 98 | item["city_name"] + ',' + 99 | item["category1_name"] + ',' + 100 | item["category2_name"] + ',' + 101 | item["result_name"] + ',' + 102 | item["result_mobile"] + "\n" 103 | ) 104 | except: 105 | logger.error(traceback.format_exc()) 106 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/__init__.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | if sys.version_info < (3, 0): 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /sasila/system_normal/processor/base_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import re 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.downloader.http.spider_request import Request 7 | from sasila.system_normal.utils.decorator import checkResponse 8 | 9 | if sys.version_info < (3, 0): 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | 14 | def identity(x): 15 | return x 16 | 17 | 18 | class Rule(object): 19 | def __init__(self, link_extractor, callback=None, process_request=identity, priority=0, only_first=False): 20 | self.link_extractor = link_extractor 21 | self.callback = callback 22 | self.process_request = process_request 23 | self.priority = priority 24 | self.only_first = only_first 25 | 26 | 27 | class LinkExtractor(object): 28 | def __init__(self, regex_str=None, css_str=None, process_value=None): 29 | if regex_str: 30 | self.regex = re.compile(regex_str) 31 | else: 32 | self.regex = None 33 | self.css_str = css_str 34 | self.process_value = process_value 35 | 36 | @checkResponse 37 | def extract_links(self, response): 38 | if self.process_value: 39 | return [response.nice_join(link) for link in self.process_value(response.m_response.content)] 40 | elif self.regex: 41 | return [response.nice_join(link) for link in self.regex.findall(response.m_response.content)] 42 | elif self.css_str: 43 | soup = bs(response.m_response.content, 'lxml') 44 | tags = soup.select(self.css_str) 45 | return [response.nice_join(tag.attrs["href"]) for tag in tags] 46 | 47 | 48 | class BaseProcessor(object): 49 | spider_id = None 50 | spider_name = None 51 | start_requests = [] 52 | rules = () 53 | allowed_domains = [] 54 | 55 | @checkResponse 56 | def process(self, response): 57 | if hasattr(self, 'rules'): 58 | rules = getattr(self, 'rules', None) 59 | else: 60 | rules = () 61 | for rule in rules: 62 | links = rule.link_extractor.extract_links(response) 63 | if links: 64 | for link in links: 65 | request = Request(url=link, callback=rule.callback, priority=rule.priority) 66 | request = rule.process_request(request) 67 | yield request 68 | if rule.only_first: 69 | break 70 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/bendibao_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineBendibao 9 | 10 | from sasila.system_normal.processor.base_processor import BaseProcessor 11 | from sasila.system_normal.downloader.http.spider_request import Request 12 | from sasila.system_normal.utils.decorator import checkResponse 13 | 14 | if sys.version_info < (3, 0): 15 | reload(sys) 16 | sys.setdefaultencoding('utf-8') 17 | 18 | start_requests_temp = [] 19 | 20 | with open(name='city.txt', mode='r') as fs: 21 | 
lines = fs.readlines() 22 | for line in lines: 23 | request_temp = Request(url=line.strip().split(',')[0] + 'wangdian/', priority=0) 24 | request_temp.meta["city_name"] = line.strip().split(',')[1] 25 | start_requests_temp.append(request_temp) 26 | 27 | 28 | class Bendibao_Processor(BaseProcessor): 29 | spider_id = 'bendibao_spider' 30 | spider_name = 'bendibao_spider' 31 | allowed_domains = ['bendibao.com'] 32 | start_requests = start_requests_temp 33 | 34 | @checkResponse 35 | def process(self, response): 36 | soup = bs(response.m_response.content, 'lxml') 37 | category1 = soup.select('div.navlink') 38 | for category in category1: 39 | category1_name = category.select('div.title h2')[0].text 40 | category_2 = category.select('ul.topic li a') 41 | for category_2_one in category_2: 42 | url = response.nice_join(category_2_one['href']) + '/' 43 | category_2_name = category_2_one.text 44 | request = Request(url=url, priority=1, callback=self.process_page_1) 45 | request.meta['city_name'] = response.request.meta['city_name'] 46 | request.meta['category1_name'] = category1_name 47 | request.meta['category2_name'] = category_2_name 48 | yield request 49 | 50 | @checkResponse 51 | def process_page_1(self, response): 52 | if '下暂无网点信息' not in response.m_response.content: 53 | soup = bs(response.m_response.content, 'lxml') 54 | results = soup.select('ul.catalist li') 55 | for result in results: 56 | result_name = result.select("div.infoschema h3 a")[0].text 57 | result_mobile = result.find(lambda tag: tag.name == 'p' and '电话:' in tag.text).text 58 | m_result = dict() 59 | m_result['result_name'] = result_name 60 | m_result['result_mobile'] = result_mobile.replace('电话:', '') 61 | m_result['city_name'] = response.request.meta['city_name'] 62 | m_result['category1_name'] = response.request.meta['category1_name'] 63 | m_result['category2_name'] = response.request.meta['city_name'] 64 | yield m_result 65 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text) 66 | if next_page: 67 | url_splits = response.request.url.split('/') 68 | url_splits[-1] = next_page['href'] 69 | url = '/'.join(url_splits) 70 | request = Request(url=url, priority=1, callback=self.process_page_1) 71 | request.meta['city_name'] = response.request.meta['city_name'] 72 | request.meta['category1_name'] = response.request.meta['category1_name'] 73 | request.meta['category2_name'] = response.request.meta['category2_name'] 74 | yield request 75 | 76 | 77 | if __name__ == '__main__': 78 | SpiderCore(Bendibao_Processor(), time_sleep=0.5).set_pipeline(TextPipelineBendibao()).set_pipeline( 79 | ConsolePipeline()).start() 80 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/car_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | 9 | from sasila.system_normal.processor.base_processor import BaseProcessor 10 | from sasila.system_normal.downloader.http.spider_request import Request 11 | from sasila.system_normal.utils.decorator import checkResponse 12 | import json 13 | import time 14 | 15 | if sys.version_info < (3, 0): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | 19 | 20 | class Car_Processor(BaseProcessor): 21 | spider_id = 'car_spider' 22 | 
spider_name = 'car_spider' 23 | allowed_domains = ['che168.com'] 24 | start_requests = [Request(url='http://www.che168.com', priority=0)] 25 | 26 | @checkResponse 27 | def process(self, response): 28 | soup = bs(response.m_response.content, 'lxml') 29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear') 30 | for province_div in province_div_list: 31 | province_name = province_div.select('span.capital a')[0].text 32 | city_list = province_div.select('div.city a') 33 | for city in city_list: 34 | city_name = city.text 35 | pinyin = city['href'].strip('/').split('/')[0] 36 | request = Request( 37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin, 38 | priority=1, callback=self.process_page_1) 39 | request.meta['province'] = province_name 40 | request.meta['city'] = city_name 41 | yield request 42 | 43 | @checkResponse 44 | def process_page_1(self, response): 45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312'))) 46 | for brand in brand_list: 47 | brand_dict = dict(brand) 48 | brand_name = brand_dict['name'] 49 | url = response.nice_join(brand_dict['url']) + '/' 50 | request = Request(url=url, priority=2, callback=self.process_page_2) 51 | request.meta['province'] = response.request.meta['province'] 52 | request.meta['city'] = response.request.meta['city'] 53 | request.meta['brand'] = brand_name 54 | yield request 55 | 56 | @checkResponse 57 | def process_page_2(self, response): 58 | soup = bs(response.m_response.content, 'lxml') 59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a') 60 | for cars_line in cars_line_list: 61 | cars_line_name = cars_line.text 62 | url = 'http://www.che168.com' + cars_line['href'] 63 | request = Request(url=url, priority=3, callback=self.process_page_3) 64 | request.meta['province'] = response.request.meta['province'] 65 | request.meta['city'] = response.request.meta['city'] 66 | request.meta['brand'] = response.request.meta['brand'] 67 | request.meta['cars_line'] = cars_line_name 68 | yield request 69 | 70 | @checkResponse 71 | def process_page_3(self, response): 72 | soup = bs(response.m_response.content, 'lxml') 73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo') 74 | for car_info in car_info_list: 75 | url = 'http://www.che168.com' + car_info['href'] 76 | request = Request(url=url, priority=4, callback=self.process_page_4) 77 | request.meta['province'] = response.request.meta['province'] 78 | request.meta['city'] = response.request.meta['city'] 79 | request.meta['brand'] = response.request.meta['brand'] 80 | request.meta['cars_line'] = response.request.meta['cars_line'] 81 | yield request 82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text) 83 | if next_page: 84 | url = 'http://www.che168.com' + next_page['href'] 85 | request = Request(url=url, priority=3, callback=self.process_page_3) 86 | request.meta['province'] = response.request.meta['province'] 87 | request.meta['city'] = response.request.meta['city'] 88 | request.meta['brand'] = response.request.meta['brand'] 89 | request.meta['cars_line'] = response.request.meta['cars_line'] 90 | yield request 91 | 92 | @checkResponse 93 | def process_page_4(self, response): 94 | soup = bs(response.m_response.content, 'lxml') 95 | # Object moved 96 | #
Object moved to here.
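# (this commented-out HTML appears to be an "Object moved" redirect page sometimes returned
#  instead of a car detail page; the div.car-title check below guards against such responses)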
97 | # 98 | if len(soup.select('div.car-title h2')) != 0: 99 | car = soup.select('div.car-title h2')[0].text 100 | detail_list = soup.select('div.details li') 101 | if len(detail_list) == 0: 102 | soup = bs(response.m_response.content, 'html5lib') 103 | detail_list = soup.select('div.details li') 104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '') 105 | first_borad_date = detail_list[1].select('span')[0].text 106 | gear = detail_list[2].select('span')[0].text.split('/')[0] 107 | displacement = detail_list[2].select('span')[0].text.split('/')[1] 108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '') 109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 110 | 111 | item = dict() 112 | item['car'] = car 113 | item['mileage'] = mileage 114 | item['first_borad_date'] = first_borad_date 115 | item['gear'] = gear 116 | item['displacement'] = displacement 117 | item['price'] = price 118 | item['crawl_date'] = crawl_date 119 | 120 | item['province'] = response.request.meta['province'] 121 | item['city'] = response.request.meta['city'] 122 | item['brand'] = response.request.meta['brand'] 123 | item['cars_line'] = response.request.meta['cars_line'] 124 | yield item 125 | 126 | 127 | if __name__ == '__main__': 128 | SpiderCore(Car_Processor(), test=True).set_pipeline(ConsolePipeline()).start() 129 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/city.txt: -------------------------------------------------------------------------------- 1 | http://cd.bendibao.com/,成都 2 | http://my.bendibao.com/,绵阳 3 | http://deyang.bendibao.com/,德阳 4 | http://nanchong.bendibao.com/,南充 5 | http://yb.bendibao.com/,宜宾 6 | http://zg.bendibao.com/,自贡 7 | http://leshan.bendibao.com/,乐山 8 | http://luzhou.bendibao.com/,泸州 9 | http://dazhou.bendibao.com/,达州 10 | http://neijiang.bendibao.com/,内江 11 | http://suining.bendibao.com/,遂宁 12 | http://pzh.bendibao.com/,攀枝花 13 | http://ms.bendibao.com/,眉山 14 | http://ga.bendibao.com/,广安 15 | http://zy.bendibao.com/,资阳 16 | http://liangshan.bendibao.com/,凉山 17 | http://guangyuan.bendibao.com/,广元 18 | http://ya.bendibao.com/,雅安 19 | http://bazhong.bendibao.com/,巴中 20 | http://xichang.bendibao.com/,西昌 21 | http://ab.bendibao.com/,阿坝 22 | http://ganzi.bendibao.com/,甘孜 -------------------------------------------------------------------------------- /sasila/system_normal/processor/city_location_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from sasila.system_normal.spider.spider_core import SpiderCore 6 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor 7 | from sasila.system_normal.downloader.http.spider_request import Request 8 | from bs4 import BeautifulSoup as bs 9 | 10 | if sys.version_info < (3, 0): 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | 14 | 15 | class CityLocationProcessor(BaseProcessor): 16 | spider_id = 'city' 17 | spider_name = 'city' 18 | allowed_domains = ['supfree.net'] 19 | start_requests = [Request(url='http://jingwei.supfree.net/')] 20 | 21 | rules = ( 22 | Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0), 23 | Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"), priority=1, only_first=True, callback='save'), 24 | ) 25 | 26 | def save(self, response): 27 | if response.m_response: 28 | soup = bs(response.m_response.content, 'lxml') 29 | name = 
soup.select("div.cdiv p")[0].string.strip().split(' ') 30 | if len(name) > 2: 31 | province = name[0] 32 | city = name[1] 33 | area = name[2] 34 | elif len(name) > 1: 35 | province = name[0] 36 | city = name[0] 37 | area = name[1] 38 | else: 39 | province = name[0] 40 | city = name[0] 41 | area = name[0] 42 | lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip() 43 | la = soup.select("div.cdiv p")[1].select("span")[1].string.strip() 44 | data = province + ',' + city + ',' + area + ',' + lo + ',' + la 45 | print(data) 46 | with open('city.txt', 'a+') as fs: 47 | data = province + ',' + city + ',' + area + ',' + lo + ',' + la 48 | fs.write(data + '\n') 49 | print(data) 50 | 51 | 52 | # fe_spider = SpiderCore(CityLocationProcessor()) 53 | # if __name__ == '__main__': 54 | # fe_spider.start() 55 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/fang_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang 9 | 10 | from sasila.system_normal.processor.base_processor import BaseProcessor 11 | from sasila.system_normal.downloader.http.spider_request import Request 12 | import time 13 | from sasila.system_normal.utils.decorator import checkResponse 14 | 15 | if sys.version_info < (3, 0): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | 19 | 20 | class Fang_Processor(BaseProcessor): 21 | spider_id = 'fang_spider' 22 | spider_name = 'fang_spider' 23 | allowed_domains = ['fang.com'] 24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)] 25 | 26 | @checkResponse 27 | def process(self, response): 28 | soup = bs(response.m_response.content, 'lxml') 29 | province_list = {u'山西'} 30 | province_div_list = soup.select('div#c02 ul li') 31 | for province_div in province_div_list: 32 | province_name = province_div.select('strong')[0].text 33 | if province_name != '其他': 34 | if province_name in province_list: 35 | city_list = province_div.select('a') 36 | for city in city_list: 37 | city_name = city.text 38 | url = city['href'] 39 | request = Request(url=url, priority=1, callback=self.process_page_1) 40 | request.meta['province'] = province_name 41 | request.meta['city'] = city_name 42 | yield request 43 | 44 | @checkResponse 45 | def process_page_1(self, response): 46 | soup = bs(response.m_response.content, 'lxml') 47 | district_list = soup.select('div.qxName a') 48 | district_list.pop(0) 49 | for district in district_list: 50 | district_name = district.text 51 | url = response.request.url + district['href'] 52 | request = Request(url=url, priority=2, callback=self.process_page_2) 53 | request.meta['province'] = response.request.meta['province'] 54 | request.meta['city'] = response.request.meta['city'] 55 | request.meta['district'] = district_name 56 | yield request 57 | 58 | @checkResponse 59 | def process_page_2(self, response): 60 | soup = bs(response.m_response.content, 'lxml') 61 | avg_price_list = soup.select('div.newcardR dl') 62 | if len(avg_price_list) > 0: 63 | avg_price = avg_price_list[1].select('dd b')[0].text 64 | else: 65 | avg_price = '未知' 66 | detail_list = soup.select('div.houseList dl') 67 | for 
detail in detail_list: 68 | if len(detail.select('p.mt10 a span')) != 0: 69 | estate = detail.select('p.mt10 a span')[0].text 70 | area = detail.select('div.area p')[0].text.replace('㎡', '') 71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip() 72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text 73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 74 | item = dict() 75 | item['avg_price'] = avg_price 76 | item['estate'] = estate 77 | item['area'] = area 78 | item['layout'] = layout 79 | item['total_price'] = total_price 80 | item['crawl_date'] = crawl_date 81 | 82 | item['province'] = response.request.meta['province'] 83 | item['city'] = response.request.meta['city'] 84 | item['district'] = response.request.meta['district'] 85 | item['url'] = response.request.url 86 | yield item 87 | 88 | next_page = soup.select('a#PageControl1_hlk_next') 89 | if len(next_page) > 0: 90 | url = response.nice_join(next_page[0]['href']) 91 | request = Request(url=url, priority=2, callback=self.process_page_2) 92 | request.meta['province'] = response.request.meta['province'] 93 | request.meta['city'] = response.request.meta['city'] 94 | request.meta['district'] = response.request.meta['district'] 95 | yield request 96 | 97 | 98 | if __name__ == '__main__': 99 | spider = SpiderCore(Fang_Processor(), test=True).set_pipeline(ConsolePipeline()).start() 100 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/fang_shop_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFangShop 9 | 10 | from sasila.system_normal.processor.base_processor import BaseProcessor 11 | from sasila.system_normal.downloader.http.spider_request import Request 12 | import time 13 | from sasila.system_normal.utils.decorator import checkResponse 14 | from sasila.system_normal.utils import logger 15 | 16 | if sys.version_info < (3, 0): 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | 20 | 21 | class Fang_Shop_Processor(BaseProcessor): 22 | spider_id = 'fang_shop_spider' 23 | spider_name = 'fang_shop_spider' 24 | allowed_domains = ['fang.com'] 25 | start_requests = [Request(url='http://shop.fang.com', priority=0)] 26 | 27 | @checkResponse 28 | def process(self, response): 29 | city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'} 30 | soup = bs('''北京 31 | 上海 32 | 广州 33 | 深圳 34 | 天津 35 | 重庆 36 | 成都 37 | 苏州 38 | 武汉 39 | 西安 40 | 东莞 41 | 昆明 42 | 杭州 43 | 济南 44 | 无锡 45 | 郑州 46 | 南昌 47 | 青岛 48 | 石家庄 49 | 南京 50 | 大连''', 'lxml') 51 | city__list = soup.select('a') 52 | for city in city__list: 53 | city_name = city.text 54 | if city_name in city_crawl_list: 55 | url = city['href'] 56 | request = Request(url=url, priority=1, callback=self.process_page_1) 57 | request.meta['city'] = city_name 58 | yield request 59 | 60 | @checkResponse 61 | def process_page_1(self, response): 62 | soup = bs(response.m_response.content, 'lxml') 63 | district_list = soup.select('div.qxName a') 64 | district_list.pop(0) 65 | for district in district_list: 66 | district_name = district.text 67 | url = response.request.url + district['href'] 68 | request = 
Request(url=url, priority=2, callback=self.process_page_2) 69 | request.meta['city'] = response.request.meta['city'] 70 | request.meta['district'] = district_name 71 | yield request 72 | 73 | @checkResponse 74 | def process_page_2(self, response): 75 | soup = bs(response.m_response.content, 'lxml') 76 | detail_list = soup.select('div.houseList dl') 77 | for detail in detail_list: 78 | estate = detail.select('p.mt15 span.spName')[0].text 79 | detail_str = detail.select('p.mt10')[0].text 80 | 81 | temp_list = detail.select('p.mt10')[0].text.split('/') 82 | temp_list = [temp.strip() for temp in temp_list] 83 | 84 | if '购物中心/百货' not in detail_str and '层' in detail_str: 85 | m_type = temp_list[0].replace('类型:', '') 86 | floor = temp_list[1] 87 | total_floor = temp_list[2].replace('层', '') 88 | elif '购物中心/百货' not in detail_str and '层' not in detail_str: 89 | m_type = temp_list[0].strip().replace('类型:', '') 90 | floor = '未知' 91 | total_floor = '未知' 92 | elif '购物中心/百货' in detail_str and '层' not in detail_str: 93 | m_type = temp_list[0].replace('类型:', '') + temp_list[1] 94 | floor = '未知' 95 | total_floor = '未知' 96 | elif '购物中心/百货' in detail_str and '层' in detail_str: 97 | m_type = temp_list[0].replace('类型:', '') + temp_list[1] 98 | floor = temp_list[2] 99 | total_floor = temp_list[3].replace('层', '') 100 | else: 101 | logger.error('unexpective detail_str: ' + detail_str.strip()) 102 | 103 | area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '') 104 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text 105 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 106 | 107 | item = dict() 108 | item['estate'] = estate 109 | item['floor'] = floor 110 | item['total_floor'] = total_floor 111 | item['type'] = m_type 112 | item['area'] = area 113 | item['total_price'] = total_price 114 | item['crawl_date'] = crawl_date 115 | 116 | item['city'] = response.request.meta['city'] 117 | item['district'] = response.request.meta['district'] 118 | item['url'] = response.request.url 119 | yield item 120 | 121 | next_page = soup.select('a#PageControl1_hlk_next') 122 | if len(next_page) > 0: 123 | url = response.nice_join(next_page[0]['href']) + '/' 124 | request = Request(url=url, priority=2, callback=self.process_page_2) 125 | request.meta['city'] = response.request.meta['city'] 126 | request.meta['district'] = response.request.meta['district'] 127 | yield request 128 | 129 | 130 | # if __name__ == '__main__': 131 | # spider = SpiderCore(Fang_Shop_Processor()).set_pipeline(ConsolePipeline()).set_pipeline( 132 | # TextPipelineFangShop()).start() 133 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/fe_loan_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from sasila.system_normal.spider.spider_core import SpiderCore 6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline 7 | 8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor 9 | from sasila.system_normal.downloader.http.spider_request import Request 10 | from bs4 import BeautifulSoup as bs 11 | 12 | if sys.version_info < (3, 0): 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | 17 | class FeProcessor(BaseProcessor): 18 | spider_id = 'fe' 19 | spider_name = 'fe' 20 | allowed_domains = ['58.com'] 21 | start_requests = 
[Request(url='http://www.58.com/daikuan/changecity/')] 22 | 23 | rules = ( 24 | Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0), 25 | Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1), 26 | Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'), 27 | ) 28 | 29 | def save(self, response): 30 | if response.m_response: 31 | print(bs(response.m_response.content, 'lxml').title.string) 32 | 33 | 34 | # fe_spider = SpiderCore(FeProcessor()).set_pipeline(PicPipeline()) 35 | # if __name__ == '__main__': 36 | # fe_spider.start() 37 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/first_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline 8 | 9 | from sasila.system_normal.processor.base_processor import BaseProcessor 10 | from sasila.system_normal.downloader.http.spider_request import Request 11 | 12 | if sys.version_info < (3, 0): 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | 17 | class FirstProcessor(BaseProcessor): 18 | spider_id = 'test' 19 | spider_name = 'test' 20 | allowed_domains = ['mzitu.com'] 21 | start_requests = [Request(url="http://www.mzitu.com/")] 22 | 23 | def process(self, response): 24 | soup = bs(response.m_response.content, 'lxml') 25 | a_list = soup.select("a") 26 | for a in a_list: 27 | if "href" in a.attrs: 28 | url = response.nice_join(a["href"]) 29 | yield {'url': url} 30 | 31 | # if __name__ == '__main__': 32 | # spider = SpiderCore(FirstProcessor()).set_pipeline(ConsolePipeline()).start() 33 | -------------------------------------------------------------------------------- /sasila/system_normal/processor/mzitu_proccessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | from bs4 import BeautifulSoup as bs 6 | from sasila.system_normal.spider.spider_core import SpiderCore 7 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline 8 | 9 | from sasila.system_normal.processor.base_processor import BaseProcessor 10 | from sasila.system_normal.downloader.http.spider_request import Request 11 | 12 | if sys.version_info < (3, 0): 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | 17 | class MezituProcessor(BaseProcessor): 18 | spider_id = 'mzitu' 19 | spider_name = 'mzitu' 20 | allowed_domains = ['mzitu.com', 'meizitu.net'] 21 | start_requests = [Request(url='http://www.mzitu.com/xinggan')] 22 | 23 | def process(self, response): 24 | if response.m_response: 25 | soup = bs(response.m_response.content, "lxml") 26 | total_page = int(soup.select_one("a.next.page-numbers").find_previous_sibling().text) 27 | for page in range(1, total_page + 1): 28 | yield Request(url="http://www.mzitu.com/xinggan/page/" + str(page), callback=self.get_page_content) 29 | 30 | def get_page_content(self, response): 31 | if response.m_response: 32 | soup = bs(response.m_response.content, 'lxml') 33 | li_list = soup.select("div.postlist ul#pins li") 34 | for li in li_list: 35 | yield Request(url=li.select_one("a").attrs["href"], callback=self.get_pic, priority=1) 36 | 37 | def get_pic(self, response): 38 | if response.m_response: 
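# pagination note: the total page count of a gallery is read from the element just before
# the '下一页»' (next page) link, and one request per page is yielded to download_pic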
39 |             li_soup = bs(response.m_response.content, "lxml")
40 |             if li_soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text) is not None:  # '下一页»' means "next page »"
41 |                 total_page = int(li_soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text) \
42 |                                  .find_previous_sibling().text)
43 |                 for page in range(1, total_page + 1):
44 |                     yield Request(url=response.request.url + "/" + str(page), callback=self.download_pic, priority=2)
45 | 
46 |     def download_pic(self, response):
47 |         if response.m_response:
48 |             href = bs(response.m_response.content, "lxml").select_one("div.main-image img").attrs["src"]
49 |             yield Request(url=href, callback=self.download, priority=3)
50 | 
51 |     def download(self, response):
52 |         if response.m_response:
53 |             if response.m_response.status_code == 200:
54 |                 yield response.m_response.content
55 | 
56 | 
57 | # mzitu_spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline())
58 | #
59 | # if __name__ == '__main__':
60 | # spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline()).start()
61 | 
--------------------------------------------------------------------------------
/sasila/system_normal/processor/mzitu_proccessor_regex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | 
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
7 | 
8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | import os
11 | import uuid
12 | 
13 | if sys.version_info < (3, 0):
14 |     reload(sys)
15 |     sys.setdefaultencoding('utf-8')
16 | 
17 | 
18 | class MezituProcessor(BaseProcessor):
19 |     spider_id = 'mzitu'
20 |     spider_name = 'mzitu'
21 |     allowed_domains = ['mzitu.com', 'meizitu.net']
22 |     start_requests = [Request(url='http://www.mzitu.com/xinggan/')]
23 | 
24 |     rules = (
25 |         Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
26 |              callback="save", priority=3),
27 |         Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
28 |         Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
29 |         Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
30 |     )
31 | 
32 |     def save(self, response):
33 |         if response.m_response:
34 |             if not os.path.exists("img"):
35 |                 os.mkdir("img")
36 |             with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
37 |                 fs.write(response.m_response.content)
38 |                 print("download success!")
39 | 
40 | 
41 | # if __name__ == '__main__':
42 | # spider = SpiderCore(MezituProcessor(), batch_size=10).set_pipeline(PicPipeline()).start()
43 | 
--------------------------------------------------------------------------------
/sasila/system_normal/processor/qcc_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | 
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
7 | from sasila.system_normal.pipeline.text_pipeline import TextPipeline
8 | from sasila.system_normal.processor.base_processor import BaseProcessor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | from bs4 import BeautifulSoup as bs
11 | import time
12 | from sasila.system_normal.utils import logger
13 | 
14 | import traceback
15 | 
16 | if sys.version_info < (3, 0):
17 |     reload(sys)
18 |     sys.setdefaultencoding('utf-8')
19 | 
20 | 
21 | class QccProcessor(BaseProcessor):
22 |     spider_id = 'qcc'
23 |     spider_name = 'qcc'
24 |     allowed_domains = ['qichacha.com']
25 | 
26 |     start_requests = [
27 |         Request(url='http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE')
28 |     ]
29 | 
30 |     def process(self, response):
31 |         if not response.m_response:
32 |             logger.error(response.request.url)
33 |             yield response.request
34 |         if '