├── .coveragerc
├── .travis.yml
├── LICENSE
├── README-SETUP.rst
├── README.md
├── pic
│   ├── feijishi.png
│   ├── jiagou.png
│   ├── jichu.png
│   ├── jigou.png
│   ├── jishi-huoqushuju.png
│   ├── jishi-shouquan.png
│   └── spider.jpg
├── requirements.txt
├── sasila-example
│   ├── car_processor.py
│   ├── fang_processor.py
│   ├── main.py
│   ├── proxy.txt
│   └── settings.py
├── sasila
│   ├── __init__.py
│   ├── settings
│   │   ├── __init__.py
│   │   └── default_settings.py
│   ├── system_instant
│   │   ├── __init__.py
│   │   ├── blueprints
│   │   │   ├── __init__.py
│   │   │   └── jd.py
│   │   ├── crawler
│   │   │   ├── __init__.py
│   │   │   └── jd
│   │   │       ├── __init__.py
│   │   │       └── request.py
│   │   ├── database
│   │   │   ├── __init__.py
│   │   │   └── jd_database.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   └── jd_manager.py
│   │   └── settings.py
│   ├── system_normal
│   │   ├── __init__.py
│   │   ├── blueprints
│   │   │   ├── __init__.py
│   │   │   └── slow_spiders.py
│   │   ├── database
│   │   │   └── __init__.py
│   │   ├── downloader
│   │   │   ├── __init__.py
│   │   │   ├── base_downloder.py
│   │   │   ├── http
│   │   │   │   ├── __init__.py
│   │   │   │   ├── selenium_response.py
│   │   │   │   ├── spider_request.py
│   │   │   │   └── spider_response.py
│   │   │   ├── proxy
│   │   │   │   ├── __init__.py
│   │   │   │   └── proxy_pool.py
│   │   │   ├── requests_downloader.py
│   │   │   ├── selenium_downloader.py
│   │   │   └── web_driver_pool.py
│   │   ├── loginer
│   │   │   ├── __init__.py
│   │   │   ├── base_loginer.py
│   │   │   └── jd_loginer.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   └── spider_manager.py
│   │   ├── pipeline
│   │   │   ├── __init__.py
│   │   │   ├── base_pipeline.py
│   │   │   ├── console_pipeline.py
│   │   │   ├── kafa_pipeline.py
│   │   │   ├── pic_pipeline.py
│   │   │   ├── pipe_item.py
│   │   │   ├── test_pipeline.py
│   │   │   └── text_pipeline.py
│   │   ├── processor
│   │   │   ├── __init__.py
│   │   │   ├── base_processor.py
│   │   │   ├── bendibao_processor.py
│   │   │   ├── car_processor.py
│   │   │   ├── city.txt
│   │   │   ├── city_location_processor.py
│   │   │   ├── fang_processor.py
│   │   │   ├── fang_shop_processor.py
│   │   │   ├── fe_loan_processor.py
│   │   │   ├── first_processor.py
│   │   │   ├── mzitu_proccessor.py
│   │   │   ├── mzitu_proccessor_regex.py
│   │   │   ├── qcc_processor.py
│   │   │   └── test_processor.py
│   │   ├── scheduler
│   │   │   ├── __init__.py
│   │   │   ├── bloom_filter.py
│   │   │   └── queue.py
│   │   ├── spider
│   │   │   ├── __init__.py
│   │   │   └── spider_core.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── cookie.py
│   │       ├── decorator.py
│   │       ├── httpobj.py
│   │       ├── jd_code.py
│   │       ├── kafka_utils.py
│   │       ├── progress_bar.py
│   │       ├── python.py
│   │       └── reqser.py
│   └── system_web
│       └── __init__.py
├── setup.py
└── tests
    ├── __init__.py
    └── test_processor.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source =
3 | sasila
4 | parallel = True
5 |
6 | [report]
7 | omit =
8 | sasila/system_instant/*
9 | sasila/system_normal/processor/*
10 |
11 | exclude_lines =
12 | pragma: no cover
13 | def __repr__
14 | if self.debug:
15 | if settings.DEBUG
16 | raise AssertionError
17 | raise NotImplementedError
18 | if 0:
19 | if __name__ == .__main__.:
20 | except ImportError:
21 | pass
22 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | language: python
3 | cache: pip
4 | python:
5 | - "2.7"
6 | - "3.5"
7 | services:
8 | - redis-server
9 | install:
10 | - pip install --no-use-wheel lxml
11 | - pip install --allow-all-external -e .[test]
12 | - pip install coveralls
13 | script:
14 | - coverage run setup.py test
15 | after_success:
16 | - coverage combine
17 | - coveralls
18 |
19 | deploy:
20 | provider: pypi
21 | skip_cleanup: true
22 | distributions: "sdist bdist_wheel"
23 | user: 'DaVinciDW'
24 | password:
25 | secure: "Uh+Q37dRElSmZ1YxiGv9aeg59xyCJ6dSJ87L/P1dFowHx267dX4l9xU9v3skrQSBKEIe8JjCHHT3b9D//1daObWuJ1PHQ6IdD5pp7Lwl2CkBW1TOP9MjAcZV9F0udH3X986owP8KCuwoVJglLWch+3FtI7iNpdrlcIUXlgwS4eAAfF6DmUJG5BSiHDfCdvEyLE2D13MqyXqWNixU9FQ6/5IPfEVrJsW0W0s+fUnvPNSq/R4l9oHrkhUb+2oI7OwYcCG+wXz6KOZaSn69a/sOPRI3thfc9v2FWKsz+XvBhqvNA67q2Q1kHaIn+KZnct+ZJD2tK4NrEJznf4mBliLT31YVsvYHmnsfO34+3W5G+PVdywE2j63uKAFVRzWfYRBVD1UAr0yFuCPD3Ghh7GzHFXEZm5Tltbng2BZQT82BxY4B8IPgHUMf418wRiOBKDGPSoZiHBXVtjwbWez36HOaMenXurLMaoCDWsUzl4QIJF723L5fS/z5Xq8iOoMo+5bsEIfp6BpsYh33n1zL887p03IFJHRnFlCPjdZJ7cQnBV2HTPwUNrls6c8DzaMncUj5W203k48nHm6YhspeS+uIEIrz2eCOgYD5AjjeBRsZfXlG6+DC0+O7Srnuih61xR0vJXQ9PpYCoPI5BMgQo+xwJJz2BP5IX7IpZ2HWJHFKC0E="
26 | on:
27 | branch: master
28 | tags: false
29 | repo: DarkSand/Sasila
30 | condition: $TRAVIS_PYTHON_VERSION = "2.7"
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2014 Binux
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README-SETUP.rst:
--------------------------------------------------------------------------------
1 | Sasila
2 | ======
3 | .. image:: https://img.shields.io/badge/version-0.0.1-green.svg
4 | :target: https://pypi.python.org/pypi/Sasila
5 | :alt: Sasila Version
6 |
7 | .. image:: https://img.shields.io/badge/pypi-v1.4.0-green.svg
8 | :target: https://pypi.python.org/pypi/Sasila
9 | :alt: Wheel Status
10 |
11 | Overview
12 | ========
13 | Sasila is a simple spider system.
14 |
15 | Install
16 | =======
17 |
18 | The quick way::
19 |
20 | pip install sasila
21 |
22 | Tutorial
23 | ========
24 | car_processor.py::
25 |
26 | #!/usr/bin/env python
27 | # -*- coding: utf-8 -*-
28 | from sasila.system_normal.spider.spider_core import SpiderCore
29 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
30 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar
31 | from sasila.system_normal.processor.base_processor import BaseProcessor
32 | from sasila.system_normal.downloader.http.spider_request import Request
33 | from sasila.system_normal.utils.decorator import checkResponse
34 |
35 | from bs4 import BeautifulSoup as bs
36 | import json
37 | import time
38 | import sys
39 |
    40 |     if sys.version_info < (3, 0):
    41 |         reload(sys)
    42 |         sys.setdefaultencoding("utf-8")
43 |
44 | class Car_Processor(BaseProcessor):
45 | spider_id = "car_spider"
46 | spider_name = "car_spider"
47 | allowed_domains = ["che168.com"]
48 | start_requests = [Request(url="http://www.che168.com", priority=0)]
49 |
50 | @checkResponse
51 | def process(self, response):
52 | soup = bs(response.m_response.content, "lxml")
53 | province_div_list = soup.select("div.city-list div.cap-city > div.fn-clear")
54 | for province_div in province_div_list:
55 | province_name = province_div.select("span.capital a")[0].text
56 | city_list = province_div.select("div.city a")
57 | for city in city_list:
58 | city_name = city.text
59 | pinyin = city["href"].strip("/").split("/")[0]
60 | request = Request(
61 | url="http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s" % pinyin,
62 | priority=1, callback=self.process_page_1)
63 | request.meta["province"] = province_name
64 | request.meta["city"] = city_name
65 | yield request
66 |
67 | @checkResponse
68 | def process_page_1(self, response):
69 | brand_list = list(json.loads(response.m_response.content.decode("gb2312")))
70 | for brand in brand_list:
71 | brand_dict = dict(brand)
72 | brand_name = brand_dict["name"]
73 | url = response.nice_join(brand_dict["url"]) + "/"
74 | request = Request(url=url, priority=2, callback=self.process_page_2)
75 | request.meta["province"] = response.request.meta["province"]
76 | request.meta["city"] = response.request.meta["city"]
77 | request.meta["brand"] = brand_name
78 | yield request
79 |
80 | @checkResponse
81 | def process_page_2(self, response):
82 | soup = bs(response.m_response.content, "lxml")
83 | cars_line_list = soup.select("div#series div.content-area dl.model-list dd a")
84 | for cars_line in cars_line_list:
85 | cars_line_name = cars_line.text
86 | url = "http://www.che168.com" + cars_line["href"]
87 | request = Request(url=url, priority=3, callback=self.process_page_3)
88 | request.meta["province"] = response.request.meta["province"]
89 | request.meta["city"] = response.request.meta["city"]
90 | request.meta["brand"] = response.request.meta["brand"]
91 | request.meta["cars_line"] = cars_line_name
92 | yield request
93 |
94 | @checkResponse
95 | def process_page_3(self, response):
96 | soup = bs(response.m_response.content, "lxml")
97 | car_info_list = soup.select("div#a2 ul#viewlist_ul li a.carinfo")
98 | for car_info in car_info_list:
99 | url = "http://www.che168.com" + car_info["href"]
100 | request = Request(url=url, priority=4, callback=self.process_page_4)
101 | request.meta["province"] = response.request.meta["province"]
102 | request.meta["city"] = response.request.meta["city"]
103 | request.meta["brand"] = response.request.meta["brand"]
104 | request.meta["cars_line"] = response.request.meta["cars_line"]
105 | yield request
106 | next_page = soup.find(lambda tag: tag.name == "a" and "下一页" in tag.text)
107 | if next_page:
108 | url = "http://www.che168.com" + next_page["href"]
109 | request = Request(url=url, priority=3, callback=self.process_page_3)
110 | request.meta["province"] = response.request.meta["province"]
111 | request.meta["city"] = response.request.meta["city"]
112 | request.meta["brand"] = response.request.meta["brand"]
113 | request.meta["cars_line"] = response.request.meta["cars_line"]
114 | yield request
115 |
116 | @checkResponse
117 | def process_page_4(self, response):
118 | soup = bs(response.m_response.content, "lxml")
    119 |         # Object moved
120 | # Object moved to here.
121 | #
122 | if len(soup.select("div.car-title h2")) != 0:
123 | car = soup.select("div.car-title h2")[0].text
124 | detail_list = soup.select("div.details li")
125 | if len(detail_list) == 0:
126 | soup = bs(response.m_response.content, "html5lib")
127 | detail_list = soup.select("div.details li")
128 | mileage = detail_list[0].select("span")[0].text.replace("万公里", "")
129 | first_borad_date = detail_list[1].select("span")[0].text
130 | gear = detail_list[2].select("span")[0].text.split("/")[0]
131 | displacement = detail_list[2].select("span")[0].text.split("/")[1]
132 | price = soup.select("div.car-price ins")[0].text.replace("¥", "")
133 | crawl_date = time.strftime("%Y-%m-%d", time.localtime(time.time()))
134 |
135 | item = dict()
136 | item["car"] = car
137 | item["mileage"] = mileage
138 | item["first_borad_date"] = first_borad_date
139 | item["gear"] = gear
140 | item["displacement"] = displacement
141 | item["price"] = price
142 | item["crawl_date"] = crawl_date
143 |
144 | item["province"] = response.request.meta["province"]
145 | item["city"] = response.request.meta["city"]
146 | item["brand"] = response.request.meta["brand"]
147 | item["cars_line"] = response.request.meta["cars_line"]
148 | yield item
149 |
150 | main.py::
151 |
152 | #!/usr/bin/env python
153 | # -*- coding: utf-8 -*-
154 | from car_processor import Car_Processor
155 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
156 | from sasila.system_normal.spider.spider_core import SpiderCore
157 | from sasila.system_normal.manager import manager
    158 |     from sasila import system_web
159 |
160 | spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline())
161 | manager.set_spider(spider_car)
    162 |     system_web.start()
163 |
164 | then start your redis server and run the script::
165 |
166 | python main.py
167 |
168 | then start the spider from your browser::
169 |
170 | http://127.0.0.1:5000/slow_spider/start?spider_id=car_spider
171 |
172 | you can stop the spider::
173 | 
174 |     http://127.0.0.1:5000/slow_spider/stop?spider_id=car_spider
175 |
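you can also check the spider's status in your browser (the detail endpoint is documented in README.md)::

    http://127.0.0.1:5000/slow_spider/detail?spider_id=car_spider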
176 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sasila [![PyPI Version]][PyPI] [![Build Status]][Travis CI] [![Coverage Status]][Coverage]
2 |
3 | There are already many crawler frameworks you can use for crawling work, such as [**scrapy**](https://github.com/scrapy/scrapy), [**webmagic**](https://github.com/code4craft/webmagic) and [**pyspider**](https://github.com/binux/pyspider), and you can also write small, customized crawler scripts directly with [**requests**](https://github.com/requests/requests) + [**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4). In real crawling work, however, each framework has its own strengths and drawbacks. scrapy, for example, is powerful, but that abundance of features can leave newcomers at a loss, and since it is built on the twisted asynchronous framework, its source code is hard for beginners to understand and its projects are hard to debug. So, borrowing the strengths of these frameworks and sticking to the principle of keeping things as simple as possible, I developed this lightweight crawler framework on top of gevent (grequests, to be precise).
4 |
5 | 
6 |
7 | * downloader: downloads pages.
8 | * processor: parses responses.
9 | * scheduler: schedules requests.
10 | * pipeline: handles the extracted data.
11 | * The downloader, processor, scheduler and pipeline are injected into the core to form a spider object.
12 | * Spider objects are managed through the manager.
13 | * The manager exposes an external access/control interface through a web API.
14 |
15 | ## **Key features**
16 |
17 | * The framework's code structure is simple, easy to use and easy to modify; newcomers and veterans alike can keep it under control.
18 | * Concurrency is implemented with gevent; compared with scrapy's twisted, the code is much easier to understand.
19 | * Fully modular design with strong extensibility.
20 | * The usage and structure borrow from [**scrapy**](https://github.com/scrapy/scrapy) and [**webmagic**](https://github.com/code4craft/webmagic), so anyone who has worked with either framework will feel right at home.
21 | * Spiders are not launched from the command line, which makes debugging easier.
22 | * No parsing module is bundled; you are free to extract data with [**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4), [**lxml**](https://github.com/lxml/lxml), [**pyquery**](https://github.com/gawel/pyquery), [**html5lib**](https://github.com/html5lib/html5lib-python) or any other parser.
23 | * Built-in proxy/IP rotation.
24 | * Supports high-concurrency crawling.
25 | * Supports distributed crawling.
26 | * Supports incremental crawling.
27 | * Supports crawling JavaScript-rendered pages (just load the SeleniumDownLoader).
28 | * Provides a web API for managing and monitoring spiders.
29 | * Provides an integration approach and structure for instant (on-demand) crawlers.
30 |
31 | ## **Installation**
32 | ```
33 | pip install sasila
34 | ```
35 | ## **Preparation**
36 | * Have your redis server ready for scheduling.
37 | * Write your redis server address into the settings.py file:
38 | ```python
39 | REDIS_HOST = 'localhost'
40 | REDIS_PORT = 6379
41 | ```
42 | ## **Building a processor (parser)**
43 | ```python
44 | #!/usr/bin/env python
45 | # -*- coding: utf-8 -*-
46 | from bs4 import BeautifulSoup as bs
47 | from sasila.system_normal.processor.base_processor import BaseProcessor
48 | from sasila.system_normal.downloader.http.spider_request import Request
49 | from sasila.system_normal.utils.decorator import checkResponse
50 |
51 | class Mzi_Processor(BaseProcessor):
52 | spider_id = 'mzi_spider'
53 | spider_name = 'mzi_spider'
54 | allowed_domains = ['mzitu.com']
55 | start_requests = [Request(url='http://www.mzitu.com/', priority=0)]
56 |
57 | @checkResponse
58 | def process(self, response):
59 | soup = bs(response.m_response.content, 'lxml')
60 |         print(soup.title.string)
61 | href_list = soup.select('a')
62 | for href in href_list:
63 | yield Request(url=response.nice_join(href['href']))
64 | ```
65 | **The syntax is almost identical to scrapy**
66 |
67 | * Every processor inherits from *BaseProcessor*; the default entry parse function is def process(self, response).
68 | * Give the processor a spider_id and a spider_name, and restrict the allowed domains.
69 | * *start_requests* holds the initial requests. A Request object supports GET and POST, priorities, callbacks and every other attribute you would expect when building a request; the default callback is *process*. (A small sketch follows this list.)
70 | * The @checkResponse decorator validates the returned *response* and logs exceptions; you can also define your own decorator.
71 | * Because parse functions use the *yield* keyword, they are generators. When *yield* returns a *Request* object, that request is pushed into the scheduler and waits to be crawled; anything else that is yielded goes into the *pipeline*, which cleans the data and stores it.
72 |
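A minimal sketch of these conventions in one place, mirroring the Request usage shown in the examples elsewhere in this repository (Demo_Processor, example.com and the 'category' key are placeholders, not part of sasila):

```python
from sasila.system_normal.processor.base_processor import BaseProcessor
from sasila.system_normal.downloader.http.spider_request import Request


class Demo_Processor(BaseProcessor):
    spider_id = 'demo_spider'
    spider_name = 'demo_spider'
    allowed_domains = ['example.com']
    start_requests = [Request(url='http://www.example.com/', priority=0)]

    def process(self, response):
        # a follow-up request with a priority, a custom callback and
        # metadata that travels along to that callback
        request = Request(url='http://www.example.com/list', priority=1,
                          callback=self.process_list)
        request.meta['category'] = 'demo'
        yield request

    def process_list(self, response):
        # anything yielded that is not a Request goes to the pipeline
        yield {'category': response.request.meta['category']}
```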
73 | **Like scrapy, sasila also offers a *LinkExtractor* way of extracting links. Below is an example of a *processor* built with *LinkExtractor* that downloads images from mzitu:**
74 |
75 | ```python
76 | #!/usr/bin/env python
77 | # -*- coding: utf-8 -*-
78 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
79 | from sasila.system_normal.downloader.http.spider_request import Request
80 | import os
81 | import uuid
82 |
83 | class MezituProcessor(BaseProcessor):
84 | spider_id = 'mzitu'
85 | spider_name = 'mzitu'
86 | allowed_domains = ['mzitu.com', 'meizitu.net']
87 | start_requests = [Request(url='http://www.mzitu.com/xinggan/')]
88 |
89 | rules = (
90 | Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),callback="save", priority=3),
91 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
92 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
93 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
94 | )
95 |
96 | def save(self, response):
97 | if response.m_response:
98 | if not os.path.exists("img"):
99 | os.mkdir("img")
100 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
101 | fs.write(response.m_response.content)
102 | print("download success!")
103 | ```
104 |
105 | **A LinkExtractor is constructed as follows**
106 |
107 | ```python
108 | LinkExtractor(regex_str=None, css_str=None, process_value=None)
109 | ```
110 |
111 | * *regex_str*: extract links with a regular expression
112 | * *css_str*: extract links with a CSS selector
113 | * You can also supply your own *process_value* to extract links, where *process_value* is a generator; see the sketch after this list
114 | * If you build a *processor* this way, do not define the default entry function def process(self, response)
115 |
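For comparison, a minimal sketch of the other two constructor arguments, written as rules you could drop into a processor's rules tuple. Only regex_str appears in the official example above; the CSS selector, the pick_links generator and its calling convention are illustrative assumptions:

```python
from sasila.system_normal.processor.base_processor import Rule, LinkExtractor

# a CSS-selector based rule (assumed to extract the href of every matched element)
css_rule = Rule(LinkExtractor(css_str='div.postlist a'), priority=1)


# a custom generator handed in via process_value
def pick_links(value):
    # yield whatever links you decide to keep; purely illustrative
    if 'mzitu.com' in value:
        yield value


custom_rule = Rule(LinkExtractor(process_value=pick_links), priority=2)
```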
116 |
117 | ## **Building a pipeline**
118 | After receiving the data, this pipeline converts it to JSON and prints it to the console:
119 | ```python
120 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
121 | import json
122 | class ConsolePipeline(ItemPipeline):
123 |     def process_item(self, item):
124 |         print(json.dumps(item).decode("unicode-escape"))
125 | ```
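As a further illustration, a sketch of a pipeline that appends every item to a JSON-lines file; JsonLinesPipeline and the file name are made up for this example and are not shipped with sasila:

```python
import io
import json

from sasila.system_normal.pipeline.base_pipeline import ItemPipeline


class JsonLinesPipeline(ItemPipeline):
    path = 'items.jsonl'  # arbitrary output file

    def process_item(self, item):
        # one JSON object per line; io.open + utf-8 behaves the same on
        # Python 2 and Python 3
        with io.open(self.path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + u'\n')
```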
126 | ## **Building a spider**
127 | * Create a spider object by injecting a *processor*:
128 | ```python
129 | from sasila.system_normal.spider.spider_core import SpiderCore
130 |
131 | spider = SpiderCore(Mzi_Processor())
132 | ```
133 | * A RequestSpider object carries every attribute you need: the batch size *batch_size*, the download interval *time_sleep*, proxy usage *use_proxy*, and so on:
134 | ```python
135 | SpiderCore(processor=None, downloader=None, use_proxy=False,scheduler=None,batch_size=None,time_sleep=None)
136 | ```
137 | * Proxy IP support is built into the project: when building the RequestSpider, set *use_proxy* to *True* and place a proxy.txt file in the same directory as your script (a short sketch follows the proxy file format below). You can also set the proxy file path in settings.py:
138 | ```python
139 | PROXY_PATH_REQUEST = 'proxy/path'
140 | ```
141 | * Write your proxy IPs into proxy.txt in the format IP,port. If a proxy requires a username and password, append them at the end:
142 | ```text
143 | 127.0.0.1,8080
144 | 127.0.0.2,8080,user,pwd
145 | 127.0.0.3,8080,user,pwd
146 | ```
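Putting the pieces together, a minimal sketch of switching the proxy pool on when building the spider (Mzi_Processor is the processor defined earlier in this README):

```python
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline

# use_proxy=True makes the downloader read proxies from proxy.txt
# (or from PROXY_PATH_REQUEST if you set it in settings.py)
spider = SpiderCore(Mzi_Processor(), use_proxy=True).set_pipeline(ConsolePipeline())
```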
147 | * RequestSpider comes with a default *downloader* and *scheduler*; if they do not meet your needs, you can customize your own.
148 | * You can set a *downloader* and a *pipeline*, and even a *scheduler*, for the spider:
149 | ```python
150 | spider = spider.set_pipeline(ConsolePipeline())
151 | ```
152 | * You can start the spider this way:
153 | ```python
154 | spider.start()
155 | ```
156 | * You can also hand the spider to the *manager* for management:
157 | ```python
158 | from sasila.system_normal.manager import manager
159 | from sasila import system_web
160 |
161 | manager.set_spider(spider)
162 |
163 | system_web.start()
164 | ```
165 |
166 | Visit http://127.0.0.1:5000/slow_spider/start?spider_id=mzi_spider to start the spider.
167 |
168 | Visit http://127.0.0.1:5000/slow_spider/stop?spider_id=mzi_spider to stop the spider.
169 |
170 | Visit http://127.0.0.1:5000/slow_spider/detail?spider_id=mzi_spider to view detailed information about the spider.
171 |
172 | ## **Handling sites that require login**
173 | * You can attach a loginer to the downloader; when the downloader is used, the loginer logs in and obtains cookies before the crawl proceeds.
174 | * You can also maintain your own cookie pool: log in with a batch of accounts and keep the successful cookies in the pool so they can be drawn on at any time. The project does not ship these features yet; pull requests are welcome~ (An illustrative cookie-pool sketch follows.)
175 |
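Purely as an illustration of the cookie-pool idea (this is not sasila API; the redis key and the login routine that produces the cookie dict are assumed):

```python
import json
import random

import redis

r = redis.StrictRedis(host='localhost', port=6379)
POOL_KEY = 'cookie_pool'  # illustrative redis key


def store_cookies(account, cookies):
    # call this after a successful login; cookies is a plain dict
    r.hset(POOL_KEY, account, json.dumps(cookies))


def pick_cookies():
    # hand a random logged-in cookie set to a request/downloader
    accounts = r.hkeys(POOL_KEY)
    if not accounts:
        return None
    return json.loads(r.hget(POOL_KEY, random.choice(accounts)).decode('utf-8'))
```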
176 | ## **Architecture**
177 | 
178 |
179 | * The scheduler dispatches tasks, the downloader fetches the page content, and the processor runs the pre-written Python script, emitting results or new link-extraction tasks (sent back to the scheduler), so the whole process forms a closed loop.
180 | * Each script is treated as a spider, and a spider_id identifies a task.
181 | * downloader
182 | 1. Fetch control: method, header, cookie, proxy, timeout, and so on.
183 | 2. Rendering can be supported by adapting a webkit engine such as phantomjs.
184 | * processor
185 | 1. Parse pages flexibly with pyquery, beautifulsoup and the like.
186 | 2. Full control over all crawl and scheduling parameters from within the script.
187 | 3. Information can be passed along to follow-up requests.
188 | 4. Exception capturing.
189 | * scheduler (a minimal sketch follows the flow chart below)
190 | 1. Task priorities.
191 | 2. Task monitoring.
192 | 3. Task deduplication and related operations.
193 | 4. Incremental support.
194 | * webApi
195 | 1. Create, delete, modify and query spiders.
196 | * Flow chart for non-instant spiders
197 |
198 | 
199 |
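To make the scheduler bullets concrete, here is a sketch of priority scheduling plus URL deduplication on top of redis. It is illustrative only and is not sasila's actual scheduler (which lives in sasila/system_normal/scheduler); the key names are made up:

```python
import redis

r = redis.StrictRedis(host='localhost', port=6379)
QUEUE_KEY = 'demo_spider:queue'  # illustrative key names
SEEN_KEY = 'demo_spider:seen'


def push_request(url, priority):
    # dedup: only enqueue URLs never seen before; keeping the seen-set
    # in redis is also what makes incremental crawling possible
    if r.sadd(SEEN_KEY, url):
        r.zadd(QUEUE_KEY, priority, url)  # redis-py 2.x argument order, as pinned in requirements.txt


def pop_request():
    # take the URL with the highest priority score
    candidates = r.zrevrange(QUEUE_KEY, 0, 0)
    if not candidates:
        return None
    url = candidates[0]
    r.zrem(QUEUE_KEY, url)
    return url
```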
200 | ## **Instant crawler**
201 | An instant crawler can be invoked through an API: you pass in the page or the requirement to crawl, and it crawls the data and returns the result on the spot. This part is not fully developed yet and is provided only as a reference design; the core example code is in *sasila.system_instant*.
202 |
203 | * Instant crawler: data-retrieval flow chart
204 |
205 | 
206 |
207 | * Instant crawler: authorization flow chart
208 |
209 | 
210 |
211 | ## **Why the name Sasila?**
212 |
213 | 
214 |
215 | As a WoW player, you can probably guess ヾ( ̄▽ ̄)
216 |
217 | ## **fetchman**
218 |
219 | A more polished crawler framework, [**fetchman**](https://github.com/DarkSand/fetchman), is now available. It builds on sasila with further optimizations and fixes and drops the web-related features (which I personally found a bit superfluous).
220 |
221 |
222 | [Build Status]: https://img.shields.io/travis/DarkSand/Sasila.svg?branch=master&style=flat
223 | [Travis CI]: https://travis-ci.org/DarkSand/Sasila
224 | [Coverage Status]: https://img.shields.io/coveralls/DarkSand/Sasila.svg?branch=master&style=flat
225 | [Coverage]: https://coveralls.io/github/DarkSand/Sasila
226 | [PyPI Version]: https://img.shields.io/pypi/v/Sasila.svg
227 | [PyPI]: https://pypi.python.org/pypi/sasila
228 |
229 |
230 |
231 |
232 |
233 |
--------------------------------------------------------------------------------
/pic/feijishi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/feijishi.png
--------------------------------------------------------------------------------
/pic/jiagou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jiagou.png
--------------------------------------------------------------------------------
/pic/jichu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jichu.png
--------------------------------------------------------------------------------
/pic/jigou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jigou.png
--------------------------------------------------------------------------------
/pic/jishi-huoqushuju.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-huoqushuju.png
--------------------------------------------------------------------------------
/pic/jishi-shouquan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-shouquan.png
--------------------------------------------------------------------------------
/pic/spider.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/spider.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==0.11.1
2 | redis==2.10.5
3 | requests==2.13.0
4 | six==1.10.0
5 | SQLAlchemy==1.1.4
6 | grequests==0.3.0
7 | selenium==2.53.6
8 | lxml==3.7.2
9 | beautifulsoup4==4.6.0
10 |
--------------------------------------------------------------------------------
/sasila-example/car_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from sasila.system_normal.spider.spider_core import SpiderCore
4 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
5 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar
6 | from sasila.system_normal.processor.base_processor import BaseProcessor
7 | from sasila.system_normal.downloader.http.spider_request import Request
8 | from sasila.system_normal.utils.decorator import checkResponse
9 |
10 | from bs4 import BeautifulSoup as bs
11 | import json
12 | import time
13 | import sys
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Car_Processor(BaseProcessor):
21 | spider_id = 'car_spider'
22 | spider_name = 'car_spider'
23 | allowed_domains = ['che168.com']
24 | start_requests = [Request(url='http://www.che168.com', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear')
30 | for province_div in province_div_list:
31 | province_name = province_div.select('span.capital a')[0].text
32 | city_list = province_div.select('div.city a')
33 | for city in city_list:
34 | city_name = city.text
35 | pinyin = city['href'].strip('/').split('/')[0]
36 | request = Request(
37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
38 | priority=1, callback=self.process_page_1)
39 | request.meta['province'] = province_name
40 | request.meta['city'] = city_name
41 | yield request
42 |
43 | @checkResponse
44 | def process_page_1(self, response):
45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312')))
46 | for brand in brand_list:
47 | brand_dict = dict(brand)
48 | brand_name = brand_dict['name']
49 | url = response.nice_join(brand_dict['url']) + '/'
50 | request = Request(url=url, priority=2, callback=self.process_page_2)
51 | request.meta['province'] = response.request.meta['province']
52 | request.meta['city'] = response.request.meta['city']
53 | request.meta['brand'] = brand_name
54 | yield request
55 |
56 | @checkResponse
57 | def process_page_2(self, response):
58 | soup = bs(response.m_response.content, 'lxml')
59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a')
60 | for cars_line in cars_line_list:
61 | cars_line_name = cars_line.text
62 | url = 'http://www.che168.com' + cars_line['href']
63 | request = Request(url=url, priority=3, callback=self.process_page_3)
64 | request.meta['province'] = response.request.meta['province']
65 | request.meta['city'] = response.request.meta['city']
66 | request.meta['brand'] = response.request.meta['brand']
67 | request.meta['cars_line'] = cars_line_name
68 | yield request
69 |
70 | @checkResponse
71 | def process_page_3(self, response):
72 | soup = bs(response.m_response.content, 'lxml')
73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
74 | for car_info in car_info_list:
75 | url = 'http://www.che168.com' + car_info['href']
76 | request = Request(url=url, priority=4, callback=self.process_page_4)
77 | request.meta['province'] = response.request.meta['province']
78 | request.meta['city'] = response.request.meta['city']
79 | request.meta['brand'] = response.request.meta['brand']
80 | request.meta['cars_line'] = response.request.meta['cars_line']
81 | yield request
82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
83 | if next_page:
84 | url = 'http://www.che168.com' + next_page['href']
85 | request = Request(url=url, priority=3, callback=self.process_page_3)
86 | request.meta['province'] = response.request.meta['province']
87 | request.meta['city'] = response.request.meta['city']
88 | request.meta['brand'] = response.request.meta['brand']
89 | request.meta['cars_line'] = response.request.meta['cars_line']
90 | yield request
91 |
92 | @checkResponse
93 | def process_page_4(self, response):
94 | soup = bs(response.m_response.content.decode('gb2312', 'ignore'), 'lxml')
95 | # Object moved
96 | # Object moved to here.
97 | #
98 | if len(soup.select('div.car-title h2')) != 0:
99 | car = soup.select('div.car-title h2')[0].text
100 | detail_list = soup.select('div.details li')
101 | if len(detail_list) == 0:
102 | soup = bs(response.m_response.content, 'html5lib')
103 | detail_list = soup.select('div.details li')
104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
105 | first_borad_date = detail_list[1].select('span')[0].text
106 | gear = detail_list[2].select('span')[0].text.split('/')[0]
107 | displacement = detail_list[2].select('span')[0].text.split('/')[1]
108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '')
109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
110 |
111 | item = dict()
112 | item['car'] = car
113 | item['mileage'] = mileage
114 | item['first_borad_date'] = first_borad_date
115 | item['gear'] = gear
116 | item['displacement'] = displacement
117 | item['price'] = price
118 | item['crawl_date'] = crawl_date
119 |
120 | item['province'] = response.request.meta['province']
121 | item['city'] = response.request.meta['city']
122 | item['brand'] = response.request.meta['brand']
123 | item['cars_line'] = response.request.meta['cars_line']
124 | yield item
125 |
126 |
127 | if __name__ == '__main__':
128 | SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineCar()).start()
129 |
--------------------------------------------------------------------------------
/sasila-example/fang_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Fang_Processor(BaseProcessor):
21 | spider_id = 'fang_spider'
22 | spider_name = 'fang_spider'
23 | allowed_domains = ['fang.com']
24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
30 | province_div_list = soup.select('div#c02 ul li')
31 | for province_div in province_div_list:
32 | province_name = province_div.select('strong')[0].text
33 | if province_name != '其他':
34 | if province_name in province_list:
35 | city_list = province_div.select('a')
36 | for city in city_list:
37 | city_name = city.text
38 | url = city['href']
39 | request = Request(url=url, priority=1, callback=self.process_page_1)
40 | request.meta['province'] = province_name
41 | request.meta['city'] = city_name
42 | yield request
43 |
44 | @checkResponse
45 | def process_page_1(self, response):
46 | soup = bs(response.m_response.content, 'lxml')
47 | district_list = soup.select('div.qxName a')
48 | district_list.pop(0)
49 | for district in district_list:
50 | district_name = district.text
51 | url = response.request.url + district['href']
52 | request = Request(url=url, priority=2, callback=self.process_page_2)
53 | request.meta['province'] = response.request.meta['province']
54 | request.meta['city'] = response.request.meta['city']
55 | request.meta['district'] = district_name
56 | yield request
57 |
58 | @checkResponse
59 | def process_page_2(self, response):
60 | soup = bs(response.m_response.content, 'lxml')
61 | avg_price_list = soup.select('div.newcardR dl')
62 | if len(avg_price_list) > 0:
63 | avg_price = avg_price_list[1].select('dd b')[0].text
64 | else:
65 | avg_price = '未知'
66 | detail_list = soup.select('div.houseList dl')
67 | for detail in detail_list:
68 | if len(detail.select('p.mt10 a span')) != 0:
69 | estate = detail.select('p.mt10 a span')[0].text
70 | area = detail.select('div.area p')[0].text.replace('㎡', '')
71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
74 | item = dict()
75 | item['avg_price'] = avg_price
76 | item['estate'] = estate
77 | item['area'] = area
78 | item['layout'] = layout
79 | item['total_price'] = total_price
80 | item['crawl_date'] = crawl_date
81 |
82 | item['province'] = response.request.meta['province']
83 | item['city'] = response.request.meta['city']
84 | item['district'] = response.request.meta['district']
85 | item['url'] = response.request.url
86 | yield item
87 |
88 | next_page = soup.select('a#PageControl1_hlk_next')
89 | if len(next_page) > 0:
90 | url = response.nice_join(next_page[0]['href'])
91 | request = Request(url=url, priority=2, callback=self.process_page_2)
92 | request.meta['province'] = response.request.meta['province']
93 | request.meta['city'] = response.request.meta['city']
94 | request.meta['district'] = response.request.meta['district']
95 | yield request
96 |
97 |
98 | if __name__ == '__main__':
99 | spider = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineFang()).start()
100 |
--------------------------------------------------------------------------------
/sasila-example/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import os
5 |
6 | sys.path.append(os.getcwd())
7 |
8 | from car_processor import Car_Processor
9 | from fang_processor import Fang_Processor
10 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
11 | from sasila.system_normal.spider.spider_core import SpiderCore
12 | from sasila.system_normal.manager import manager
13 | from sasila import system_web
14 |
15 | if __name__ == '__main__':
16 | spider_car = SpiderCore(Car_Processor(),batch_size=100).set_pipeline(ConsolePipeline())
17 | spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())
18 | manager.set_spider(spider_car)
19 | manager.set_spider(spider_fang)
20 | system_web.start()
21 |
--------------------------------------------------------------------------------
/sasila-example/proxy.txt:
--------------------------------------------------------------------------------
1 | 127.0.0.1,8888
--------------------------------------------------------------------------------
/sasila-example/settings.py:
--------------------------------------------------------------------------------
1 | # settings
2 |
3 | # phantomjs user agent
4 | # USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
5 |
6 | # phantomjs executable path
7 | # PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe'
8 |
9 | # phantomjs service arguments
10 | # PHANTOMJS_SERVICE = [
11 | # '--proxy=localhost:8888',
12 | # '--proxy-type=http',
13 | # # '--proxy-auth=username:password'
14 | # ]
15 |
16 | # phantomjs driver pool size
17 | # DRIVER_POOL_SIZE = 5
18 |
19 | # proxy file path
20 | # PROXY_PATH_REQUEST = 'proxy/path'
21 |
22 | # redis host
23 | # REDIS_HOST = 'localhost'
24 |
25 | # redis port
26 | REDIS_PORT = 6379
27 |
--------------------------------------------------------------------------------
/sasila/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sasila.settings
4 |
5 | __version__ = '0.0.26'
6 |
--------------------------------------------------------------------------------
/sasila/settings/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import imp
4 | import sys
5 | import os
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 | from sasila.settings import default_settings
12 |
13 | setting_path = os.path.join(os.getcwd(), 'settings.py')
14 |
15 | # If a settings.py file exists in the working directory, override the default settings with it
16 | if os.path.exists(setting_path):
17 | new_settings = imp.load_source('settings', setting_path)
18 |
19 | new_settings_dict = dict()
20 | for key in dir(new_settings):
21 | if key.isupper():
22 | new_settings_dict[key] = getattr(new_settings, key)
23 | if sys.version_info < (3, 0):
24 | for key, value in new_settings_dict.iteritems():
25 | setattr(default_settings, key, value)
26 | else:
27 | for key, value in new_settings_dict.items():
28 | setattr(default_settings, key, value)
--------------------------------------------------------------------------------
/sasila/settings/default_settings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import os
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 | BASE_DIR = os.getcwd()
11 |
12 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
13 |
14 | PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe'
15 |
16 | # PHANTOMJS_SERVICE = [
17 | # '--proxy=localhost:8888',
18 | # '--proxy-type=http',
19 | # # '--proxy-auth=username:password'
20 | # ]
21 |
22 | PHANTOMJS_SERVICE = None
23 |
24 | DRIVER_POOL_SIZE = 5
25 |
26 | PROXY_PATH_REQUEST = os.path.join(BASE_DIR, 'proxy.txt')
27 |
28 | REDIS_HOST = 'localhost'
29 |
30 | REDIS_PORT = 6379
31 |
32 |
--------------------------------------------------------------------------------
/sasila/system_instant/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/blueprints/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/blueprints/jd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import json
5 | from flask import Blueprint
6 | from flask import request
7 | from sasila.system_instant.manager.jd_manager import JdManager
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 | im_jd = Blueprint('im_jd', __name__)
14 |
15 | jd_manager = JdManager()
16 |
17 |
18 | @im_jd.route('/login')
19 | def login():
20 | return jd_manager.login(request.args['collect_token'], request.args['account'], request.args['password'])
21 |
22 |
23 | @im_jd.route('/qrlogin')
24 | def qr_login():
25 | message = jd_manager.qrlogin(request.args['collect_token'])
26 |     # result = '' + message
27 | # return result
28 | return message
29 |
30 |
31 | @im_jd.route('/submit_qrlogin')
32 | def submit_qrlogin():
33 | return jd_manager.submit_qrlogin(request.args['collect_token'])
34 |
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/jd/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/jd/request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import json
4 | import sys
5 | import time
6 | import requests
7 | from bs4 import BeautifulSoup as bs
8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool
9 | from sasila.system_normal.utils.cookie import formart_selenium_cookies
10 | from sasila.system_normal.utils import logger
11 | from sasila.system_normal.utils import jd_code
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | def abstract(text, start, end):
19 | if text is None or text == '':
20 | return ''
21 | res = ''
22 | if start is not None and start != '':
23 | if start not in text:
24 | return res
25 | else:
26 | text = text[text.index(start) + len(start):]
27 | if end is not None and end != '':
28 | if end not in text:
29 | return res
30 | else:
31 | res = text[0:text.index(end)]
32 | else:
33 | res = text
34 | return res
35 |
36 |
37 | class JdMessage(object):
38 | def __init__(self):
39 | self.code = ""
40 | self.code_description = ""
41 | self.cookies = ""
42 | self.qr_captcha = ""
43 |
44 |
45 | class JdRequest(object):
46 | def __init__(self):
47 | self.web_driver_pool = None # type: Queue
48 |
49 | def init_pool(self):
50 | logger.info('init web driver pool...')
51 | self.web_driver_pool = get_web_driver_pool(1)
52 | logger.info('init web driver pool success...')
53 |
54 | def login(self, account, password):
55 | message = JdMessage()
56 |
57 | web = self.web_driver_pool.get() # type: webdriver.PhantomJS
58 | web.delete_all_cookies()
59 |
60 | web.get("https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F")
61 | element = web.find_element_by_css_selector("div.login-tab.login-tab-r").find_element_by_css_selector("a")
62 | element.click()
63 | element = web.find_element_by_id("loginname")
64 | element.clear()
65 | element.send_keys(account)
66 | element = web.find_element_by_id("nloginpwd")
67 | element.clear()
68 | element.send_keys(password)
69 | element = web.find_element_by_css_selector("a#loginsubmit")
70 | element.click()
71 | time.sleep(3)
72 |
73 | if '我的京东' in bs(web.execute_script("return document.documentElement.outerHTML"), 'lxml').title.string:
74 | message.code = jd_code.SUCCESS
75 | message.code_description = "登录成功"
76 | message.cookies = formart_selenium_cookies(web.get_cookies())
77 | else:
78 |             # cases such as needing an SMS verification code, etc.
79 | pass
80 |
81 | self.web_driver_pool.put(web)
82 | return message
83 |
84 | def qr_login(self):
85 | message = JdMessage()
86 | headers = dict()
87 | headers[
88 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
89 | headers["Accept"] = "*/*"
90 | headers["Accept-Encoding"] = "gzip, deflate"
91 | headers["Accept-Language"] = "zh-CN,en,*"
92 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F"
93 | session = requests.Session()
94 | response = session.get("https://qr.m.jd.com/show?appid=133&size=147&t=" + str(time.time()))
95 |
96 | message.code = jd_code.SUCCESS
97 | message.qr_captcha = response.content.encode("base64")
98 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape")
99 | return message
100 |
101 | def submit_qrlogin(self, cookies):
102 | message = JdMessage()
103 |
104 | headers = dict()
105 | headers[
106 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
107 | headers["Accept"] = "*/*"
108 | headers["Accept-Encoding"] = "gzip, deflate"
109 | headers["Accept-Language"] = "zh-CN,en,*"
110 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F"
111 | session = requests.Session()
112 |
113 | response = session.get("https://qr.m.jd.com/check?callback=jQuery6172296&appid=133&_=1486609849337",
114 | cookies=json.loads(cookies),
115 | headers=headers)
116 |
117 | ticket = abstract(response.content, '\"ticket\" : \"', '\"')
118 |
119 | headers['X-Requested-With'] = 'XMLHttpRequest'
120 | response = session.get("https://passport.jd.com/uc/qrCodeTicketValidation?t=" + ticket, headers=headers)
121 |
122 | message.code = jd_code.SUCCESS
123 | message.code_description = "登录成功"
124 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape")
125 |
126 | return message
127 |
--------------------------------------------------------------------------------
/sasila/system_instant/database/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/database/jd_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sqlalchemy import Column, Integer, String, DateTime, create_engine
5 | from sqlalchemy.orm import sessionmaker
6 | from sqlalchemy.ext.declarative import declarative_base
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 | # Base class for ORM model classes:
13 | Base = declarative_base()
14 |
15 |
16 | class Process(Base):
17 |     # Table name:
18 | __tablename__ = 'crawler_flow_info'
19 |     # Table columns:
20 | collect_token = Column(String(100), primary_key=True)
21 | customer_id = Column(String(100))
22 | token_valid_time = Column(Integer)
23 | token_create_time = Column(Integer)
24 | status = Column(String(10))
25 | cookies = Column(String(5000))
26 |
27 |
28 | class JdDatabase(object):
29 | def __init__(self):
30 |         # Initialize the database connection:
31 | self.engine = create_engine('mysql+mysqlconnector://root:root@192.168.3.210:3306/hiveengine')
32 |         # Create the DBSession class:
33 | self.DBSession = sessionmaker(bind=self.engine)
34 | self._create_all()
35 |
36 | def _create_all(self):
37 | '''
38 |         Create all tables derived from Base; tables that already exist are left untouched.
39 | :return:
40 | '''
41 | Base.metadata.create_all(self.engine)
42 |
43 | def _drop_all(self):
44 | '''
45 |         Drop all tables in the database.
46 | :return:
47 | '''
48 | Base.metadata.drop_all(self.engine)
49 |
50 | def create_session(self):
51 | return self.DBSession()
52 |
53 | def query_cookie(self, collect_token):
54 | session = self.DBSession()
55 | cookies = session.query(Process).filter(Process.collect_token == collect_token).first().cookies
56 | session.close()
57 | return cookies
58 |
59 | def update_cookie(self, collect_token, cookies):
60 | session = self.DBSession()
61 | session.query(Process).filter(Process.collect_token == collect_token).update({
62 | Process.cookies: cookies
63 | })
64 | session.commit()
65 | session.close()
66 |
--------------------------------------------------------------------------------
/sasila/system_instant/manager/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/manager/jd_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import datetime
4 | from sasila.system_normal.utils import jd_code
5 | import json
6 | from sasila.system_instant.crawler.jd.request import JdRequest
7 | from sasila.system_instant.database.jd_database import *
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | class JdResponse(object):
15 | def __init__(self, code, code_description, qr_captcha=None):
16 | self.code = code
17 | self.code_description = code_description
18 | self.qr_captcha = qr_captcha
19 |
20 |
21 | class JdManager(object):
22 | def __init__(self):
23 | self.database = JdDatabase()
24 | self.request = JdRequest()
25 | self.request.init_pool()
26 |
27 | def login(self, collect_token, account, password):
28 | message = self.request.login(account, password)
29 | if message.code == jd_code.SUCCESS:
30 | self.database.update_cookie(collect_token, message.cookies)
31 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode(
32 | 'unicode-escape')
33 |
34 | def qrlogin(self, collect_token):
35 | message = self.request.qr_login()
36 | if message.code == jd_code.SUCCESS:
37 | self.database.update_cookie(collect_token, message.cookies)
38 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description,
39 | qr_captcha=message.qr_captcha).__dict__).decode(
40 | 'unicode-escape')
41 |
42 | def submit_qrlogin(self, collect_token):
43 | cookies = self.database.query_cookie(collect_token)
44 | message = self.request.submit_qrlogin(cookies)
45 | if message.code == jd_code.SUCCESS:
46 | self.database.update_cookie(collect_token, message.cookies)
47 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode(
48 | 'unicode-escape')
49 |
--------------------------------------------------------------------------------
/sasila/system_instant/settings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/blueprints/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/blueprints/slow_spiders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from flask import Blueprint, request
5 | from sasila.system_normal.manager import manager
6 | import json
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 | slow_spider = Blueprint('slow_spider', __name__)
13 |
14 |
15 | @slow_spider.route('/all')
16 | def get_all_spider():
17 | return json.dumps(manager.get_all_spider())
18 |
19 |
20 | @slow_spider.route('/find')
21 | def find_spider():
22 | return json.dumps(manager.find_spider(request.args['spider_id']))
23 |
24 |
25 | @slow_spider.route('/start')
26 | def start_spider():
27 | spider_id = request.args['spider_id']
28 | manager.start_spider(spider_id)
29 | return 'start success:' + spider_id
30 |
31 |
32 | @slow_spider.route('/restart')
33 | def restart_spider():
34 | spider_id = request.args['spider_id']
35 | manager.stop_spider(spider_id)
36 | manager.restart_spider(spider_id)
37 | return 'restart success:' + spider_id
38 |
39 |
40 | @slow_spider.route('/stop')
41 | def stop_spider():
42 | spider_id = request.args['spider_id']
43 | manager.stop_spider(spider_id)
44 | return 'stop success:' + spider_id
45 |
46 |
47 | @slow_spider.route('/detail')
48 | def get_spider_detail():
49 | return manager.get_spider_detail(request.args['spider_id'])
50 |
51 |
52 | @slow_spider.route('/init')
53 | def init_system():
54 | return json.dumps(manager.init_system())
55 |
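56 |
57 | # A rough sketch of how these endpoints are meant to be called once the
58 | # blueprint is registered (the '/slow_spider' URL prefix is an assumption; it
59 | # depends on how the Flask app registers the blueprint):
60 | #
61 | # GET /slow_spider/all
62 | # GET /slow_spider/start?spider_id=car_spider
63 | # GET /slow_spider/detail?spider_id=car_spider
64 | # GET /slow_spider/stop?spider_id=car_spider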
--------------------------------------------------------------------------------
/sasila/system_normal/database/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/base_downloder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class BaseDownLoader(object):
11 | def __init__(self):
12 | self.loginer = None
13 |
14 | def download(self, request):
15 | pass
16 |
17 | def set_loginer(self, loginer):
18 | self.loginer = loginer
19 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/selenium_response.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from posixpath import normpath
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 | from urlparse import urljoin, urlparse, urlunparse
11 | else:
12 | from urllib.parse import urljoin, urlparse, urlunparse
13 |
14 |
15 | class SeleniumResponse(object):
16 | def __init__(self, m_response=None, request=None):
17 | self.request = request
18 | self.m_response = m_response
19 |
20 | def __str__(self):
21 | if self.m_response:
22 | return "<SeleniumResponse %s %sKB>" % (self.request.url, (float(len(self.m_response.content)) / 1000))
23 | else:
24 | return "<SeleniumResponse %s failed>" % self.request.url
25 |
26 | def nice_join(self, url):
27 | url1 = urljoin(self.request.url, url)
28 | arr = urlparse(url1)
29 | path = normpath(arr[2])
30 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment))
31 |
32 | def is_url(self, url):
33 | if re.match(r'^https?:/{2}\w.+$', url):
34 | return True
35 | else:
36 | return False
37 |
38 | __repr__ = __str__
39 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/spider_request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class Request(object):
11 | def __init__(self, url=None, data=None, json=None, headers=None, method="GET", cookies=None, meta=None,
12 | callback=None,
13 | errback=None, priority=0, allow_redirects=True, timeout=5, duplicate_remove=True):
14 | self.url = url
15 | self.data = data
16 | self.json = json
17 | self.headers = headers
18 | self.method = method
19 | self.allow_redirects = allow_redirects
20 | if not meta:
21 | self.meta = {}
22 | else:
23 | self.meta = meta
24 | self.cookies = cookies
25 | self.callback = callback
26 | self.priority = priority
27 | self.duplicate_remove = duplicate_remove
28 | self.timeout = timeout
29 | self.errback = errback
30 |
31 | def __str__(self):
32 | return "<Request %s %s>" % (self.method, self.url)
33 |
34 | __repr__ = __str__
35 |
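36 |
37 | # A minimal sketch of building a request inside a processor callback (URL and
38 | # meta values are placeholders):
39 | #
40 | # request = Request(url='http://www.che168.com', method='GET', priority=1,
41 | #                   callback=self.process_page_1, timeout=10)
42 | # request.meta['city'] = 'placeholder'
43 | # yield request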
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/spider_response.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from posixpath import normpath
6 |
7 | from requests.models import Response as Response_name
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 | from urlparse import urljoin, urlparse, urlunparse
13 | else:
14 | from urllib.parse import urljoin, urlparse, urlunparse
15 |
16 |
17 | class Response(object):
18 | def __init__(self, m_response=None, request=None):
19 | self.request = request
20 | self.m_response = m_response
21 |
22 | def __str__(self):
23 | if isinstance(self.m_response, Response_name):
24 | if self.m_response:
25 | return "<Response [%s] %s %sKB>" % (
26 | self.m_response.status_code, self.m_response.url, (float(len(self.m_response.content)) / 1000))
27 | else:
28 | return "<Response %s error>" % self.request.url
29 | else:
30 | return "<Response %s failed>" % self.request.url
31 |
32 | def nice_join(self, url):
33 | url1 = urljoin(self.request.url, url)
34 | arr = urlparse(url1)
35 | path = normpath(arr[2])
36 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment))
37 |
38 | def is_url(self, url):
39 | if re.match(r'^https?:/{2}\w.+$', url):
40 | return True
41 | else:
42 | return False
43 |
44 | __repr__ = __str__
45 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/proxy/proxy_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.settings.default_settings import PROXY_PATH_REQUEST
6 |
7 | if sys.version_info < (3, 0):
8 | from Queue import Queue
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 | else:
12 | from queue import Queue
13 |
14 |
15 | class ProxyPool(object):
16 | def __init__(self):
17 | self.queue = Queue()
18 | with open(PROXY_PATH_REQUEST, 'r') as f:
19 | lines = f.readlines()
20 | self.len = len(lines)
21 | for line in lines:
22 | info = line.strip().split(',')
23 | proxy = {}
24 | if len(info) == 2:
25 | proxy = {"http": "http://%s:%s" % (info[0], info[1]),
26 | "https": "http://%s:%s" % (info[0], info[1])}
27 | elif len(info) == 4:
28 | proxy = {"http": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1]),
29 | "https": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1])}
30 | self.queue.put(proxy)
31 |
32 | def __len__(self):
33 | return self.len
34 |
35 | def getProxy(self):
36 | proxy = self.queue.get()
37 | self.queue.put(proxy)
38 | return proxy
39 |
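40 |
41 | # A minimal sketch of the proxy.txt format this pool expects (addresses are
42 | # placeholders): each line is either "host,port" or "host,port,user,password".
43 | #
44 | # 127.0.0.1,8888
45 | # 127.0.0.1,8888,user,password
46 | #
47 | # getProxy() hands proxies out round-robin, returning a dict that can be passed
48 | # straight to requests, e.g. {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}.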
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/requests_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import grequests
5 | import requests
6 | from requests.adapters import HTTPAdapter
7 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader
8 | from sasila.system_normal.downloader.http.spider_response import Response
9 | from sasila.system_normal.downloader.proxy.proxy_pool import ProxyPool
10 |
11 | from sasila.system_normal.utils import logger
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | class RequestsDownLoader(BaseDownLoader):
19 | # proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",}
20 |
21 | def __init__(self, loginer=None, use_proxy=False):
22 | self.loginer = loginer
23 | self.use_proxy = use_proxy
24 | if use_proxy:
25 | self.proxy_pool = ProxyPool()
26 | if len(self.proxy_pool) == 0:
27 | self.use_proxy = False
28 | self._cookies = None
29 |
30 | self._headers = dict()
31 | self._headers[
32 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
33 | self._headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
34 | self._headers["Accept-Encoding"] = "gzip, deflate, sdch"
35 | self._headers["Accept-Language"] = "zh-CN,zh;q=0.8"
36 | self._request_retry = HTTPAdapter(max_retries=3)
37 |
38 | cookie_dict = dict()
39 | self._cookies = cookie_dict
40 |
41 | def init_loginer(self, account, password):
42 | self._cookies = self.loginer.login(account, password)
43 |
44 | def download(self, batch):
45 | batch_requests = []
46 |
47 | for request in batch:
48 | session = requests.session()
49 | session.mount('https://', self._request_retry)
50 | session.mount('http://', self._request_retry)
51 |
52 | if not request.headers:
53 | request.headers = self._headers
54 | session.headers = self._headers
55 |
56 | if request.method.upper() == "GET":
57 | if self.use_proxy:
58 | m_proxies = self.proxy_pool.getProxy()
59 | batch_requests.append(grequests.get(
60 | session=session,
61 | url=request.url,
62 | headers=request.headers,
63 | cookies=self._cookies,
64 | verify=False,
65 | allow_redirects=request.allow_redirects,
66 | timeout=request.timeout,
67 | proxies=m_proxies
68 | ))
69 | else:
70 | batch_requests.append(grequests.get(
71 | session=session,
72 | url=request.url,
73 | headers=request.headers,
74 | cookies=self._cookies,
75 | verify=False,
76 | allow_redirects=request.allow_redirects,
77 | timeout=request.timeout
78 | ))
79 | elif request.method.upper() == "POST":
80 | if self.use_proxy:
81 | m_proxies = self.proxy_pool.getProxy()
82 | batch_requests.append(grequests.post(
83 | session=session,
84 | url=request.url,
85 | data=request.data,
86 | json=request.json,
87 | headers=request.headers,
88 | cookies=self._cookies,
89 | verify=False,
90 | allow_redirects=request.allow_redirects,
91 | timeout=request.timeout,
92 | proxies=m_proxies
93 | ))
94 | else:
95 | batch_requests.append(grequests.post(
96 | session=session,
97 | url=request.url,
98 | data=request.data,
99 | json=request.json,
100 | headers=request.headers,
101 | cookies=self._cookies,
102 | verify=False,
103 | allow_redirects=request.allow_redirects,
104 | timeout=request.timeout
105 | ))
106 | else:
107 | pass
108 |
109 | rets = grequests.map(batch_requests, exception_handler=exception_handler)
110 |
111 | true_responses = []
112 | index = 0
113 | for ret in rets:
114 | true_response = Response(
115 | m_response=ret,
116 | request=batch[index],
117 | )
118 | true_responses.append(true_response)
119 | logger.info(true_response)
120 | index += 1
121 |
122 | return true_responses
123 |
124 |
125 | def exception_handler(request, exception):
126 | logger.error("%s %s" % (request.url, exception))
127 |
128 |
129 | if __name__ == "__main__":
130 | proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",}
131 | requests.post(url="http://www.jd.com", data={"123": "fdsgs"})
132 |
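133 |
134 | # A minimal sketch of the batch API (URL is a placeholder): download() takes a
135 | # list of Request objects and returns one Response per request, in order.
136 | #
137 | # from sasila.system_normal.downloader.http.spider_request import Request
138 | # downloader = RequestsDownLoader(use_proxy=False)
139 | # for response in downloader.download([Request(url='http://www.che168.com')]):
140 | #     print(response)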
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/selenium_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.settings import default_settings
6 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader
7 | from sasila.system_normal.downloader.http.selenium_response import SeleniumResponse
8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool
9 | from sasila.system_normal.utils import logger
10 | from multiprocessing.pool import ThreadPool as Pool
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class SeleniumDownLoader(BaseDownLoader):
18 | def __init__(self, driver_pool_size=None):
19 | self.driver_pool_size = driver_pool_size
20 | logger.info("init web driver pool...")
21 | if driver_pool_size:
22 | self.web_driver_pool = get_web_driver_pool(driver_pool_size)
23 | else:
24 | self.web_driver_pool = get_web_driver_pool(default_settings.DRIVER_POOL_SIZE)
25 | logger.info("init web driver pool success")
26 |
27 | def download_one(self, request):
28 | web = self.web_driver_pool.get() # type:WebDriver
29 | web.get(request.url)
30 | m_response = m_object()
31 | m_response.content = web.execute_script("return document.documentElement.outerHTML")
32 | response = SeleniumResponse(m_response=m_response, request=request)
33 | self.web_driver_pool.put(web)
34 | return response
35 |
36 | def download(self, batch):
37 | if self.driver_pool_size:
38 | pool = Pool(processes=self.driver_pool_size)
39 | else:
40 | pool = Pool(processes=default_settings.DRIVER_POOL_SIZE)
41 |
42 | results = []
43 |
44 | for request in batch:
45 | results.append(pool.apply_async(self.download_one, (request,)))
46 | pool.close()
47 | pool.join()
48 |
49 | true_responses = []
50 | for result in results:
51 | true_response = result.get()
52 | true_responses.append(true_response)
53 | logger.info(true_response)
54 |
55 | return true_responses
56 |
57 |
58 | class m_object(object):
59 | pass
60 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/web_driver_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 |
6 | from selenium import webdriver
7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
8 |
9 | from sasila.settings import default_settings
10 |
11 | if sys.version_info < (3, 0):
12 | from Queue import Queue
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 | else:
16 | from queue import Queue
17 |
18 | dcap = dict(DesiredCapabilities.PHANTOMJS)
19 | dcap["phantomjs.page.settings.resourceTimeout"] = 10
20 | dcap["phantomjs.page.settings.loadImages"] = True
21 | dcap["phantomjs.page.settings.userAgent"] = default_settings.USER_AGENT
22 |
23 |
24 | def _get_base_driver():
25 | if default_settings.PHANTOMJS_SERVICE:
26 | web = webdriver.PhantomJS(service_args=default_settings.PHANTOMJS_SERVICE,
27 | executable_path=default_settings.PHANTOMJS_PATH, desired_capabilities=dcap)
28 | else:
29 | web = webdriver.PhantomJS(executable_path=default_settings.PHANTOMJS_PATH,
30 | desired_capabilities=dcap)
31 | return web
32 |
33 |
34 | def get_web_driver_pool(num):
35 | driver_queue = Queue()
36 | i = 0
37 | while i < num:
38 | web = _get_base_driver()
39 | driver_queue.put(web)
40 | i += 1
41 | return driver_queue
42 |
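43 |
44 | # A minimal sketch of borrowing a PhantomJS driver from the pool (pool size and
45 | # URL are placeholders); put the driver back after use so other workers can
46 | # reuse it:
47 | #
48 | # pool = get_web_driver_pool(2)
49 | # web = pool.get()
50 | # web.get('http://www.mzitu.com/')
51 | # html = web.execute_script("return document.documentElement.outerHTML")
52 | # pool.put(web)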
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/base_loginer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class BaseLoginer(object):
11 | def login(self, account, password):
12 | cookies = ""
13 | return cookies
14 |
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/jd_loginer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/manager/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from sasila.system_normal.manager.spider_manager import SpiderManager
4 | import sys
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 | manager = SpiderManager()
11 |
--------------------------------------------------------------------------------
/sasila/system_normal/manager/spider_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import json
5 | import threading
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 |
12 | class SpiderManager(object):
13 | def __init__(self):
14 | self.spider_list = dict()
15 |
16 | def set_spider(self, spider):
17 | self.spider_list[spider._spider_id] = spider
18 |
19 | def del_spider(self, spider_id):
20 | if spider_id in self.spider_list.keys():
21 | self.spider_list[spider_id].stop()
22 | del self.spider_list[spider_id]
23 |
24 | def init_system(self):
25 | pass
26 |
27 | def get_all_spider(self):
28 | return json.dumps(list(self.spider_list.keys()))
29 |
30 | def find_spider(self, spider_id):
31 | pass
32 |
33 | def start_spider(self, spider_id):
34 | if self.spider_list[spider_id]._spider_status == "stopped":
35 | thread = threading.Thread(target=self.spider_list[spider_id].start)
36 | thread.setDaemon(True)
37 | thread.start()
38 |
39 | def restart_spider(self, spider_id):
40 | thread = threading.Thread(target=self.spider_list[spider_id].restart)
41 | thread.setDaemon(True)
42 | thread.start()
43 |
44 | def stop_spider(self, spider_id):
45 | self.spider_list[spider_id].stop()
46 |
47 | def get_spider_detail(self, spider_id):
48 | return str(self.spider_list[spider_id]._process_count)
49 |
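50 |
51 | # A minimal sketch of driving the manager (the spider itself is assumed to be a
52 | # configured SpiderCore instance exposing _spider_id):
53 | #
54 | # manager = SpiderManager()
55 | # manager.set_spider(spider)
56 | # manager.start_spider(spider._spider_id)   # runs spider.start() in a daemon thread when it is stopped
57 | # print(manager.get_spider_detail(spider._spider_id))
58 | # manager.stop_spider(spider._spider_id)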
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/base_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class ItemPipeline(object):
11 | def process_item(self, item):
12 | raise NotImplementedError
13 |
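14 |
15 | # A minimal sketch of a custom pipeline: pipelines are attached with
16 | # SpiderCore.set_pipeline(), and process_item appears to be called with each
17 | # non-request result a processor yields.
18 | #
19 | # class LoggingPipeline(ItemPipeline):
20 | #     def process_item(self, item):
21 | #         print(item)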
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/console_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | import json
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 |
12 | class ConsolePipeline(ItemPipeline):
13 | def process_item(self, item):
14 | if sys.version_info < (3, 0):
15 | print(json.dumps(item).decode("unicode-escape"))
16 | else:
17 | print(json.dumps(item).encode('utf8').decode("unicode-escape"))
18 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/kafa_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # import sys
4 | # from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | # import json
6 | # from sasila.system_normal.utils.kafka_utils import send_message
7 | #
8 | # reload(sys)
9 | # sys.setdefaultencoding('utf-8')
10 | #
11 | #
12 | # class KafkaPipeline(ItemPipeline):
13 | # def process_item(self, item):
14 | # send_message("dataCollectionTopic", bytes("CompanyConsummer__" + json.dumps(item).decode("unicode-escape")))
15 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/pic_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import sys
5 | import uuid
6 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 |
13 | class PicPipeline(ItemPipeline):
14 | def process_item(self, item):
15 | if item is not None:
16 | if not os.path.exists("img"):
17 | os.mkdir("img")
18 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
19 | fs.write(item)
20 | print("download success!")
21 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/pipe_item.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | if sys.version_info < (3, 0):
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 |
8 |
9 | class pipeItem(object):
10 | def __init__(self, pipenames=None, result=None):
11 | self.pipenames = pipenames if pipenames is not None else []
12 | self.result = result
13 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/test_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class TestPipeline(ItemPipeline):
12 | def __init__(self):
13 | self.result = {}
14 |
15 | def process_item(self, item):
16 | self.result = item
17 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/text_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | from sasila.system_normal.utils import logger
6 | import traceback
7 | import codecs
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | class TextPipeline(ItemPipeline):
15 | def process_item(self, item):
16 | with open("result.txt", 'a') as f:
17 | f.write(
18 | item["province"] + ',' +
19 | item["city"] + ',' +
20 | item["company_name"] + ',' +
21 | item["company_man"] + ',' +
22 | item["company_telephone"] + ',' +
23 | item["company_address"] + ',' +
24 | item["company_registered_capital"] + ',' +
25 | item["company_registered_time"] + ',' +
26 | item["company_status"] + ',' +
27 | item["source"] + ',' +
28 | item["update_time"] + "\n"
29 | )
30 |
31 |
32 | class TextPipelineCar(ItemPipeline):
33 | def process_item(self, item):
34 | try:
35 | with codecs.open("result.csv", 'a', 'gbk') as f:
36 | f.write(
37 | item["province"] + ',' +
38 | item["city"] + ',' +
39 | item["brand"].replace(u'\u30fb', '·') + ',' +
40 | item["cars_line"].replace(u'\u30fb', '·') + ',' +
41 | item["car"].replace(u'\u30fb', '·') + ',' +
42 | item["mileage"] + ',' +
43 | item["first_borad_date"] + ',' +
44 | item["gear"] + ',' +
45 | item["displacement"] + ',' +
46 | item["price"] + ',' +
47 | item["crawl_date"] + "\n"
48 | )
49 | except:
50 | logger.error(traceback.format_exc())
51 |
52 |
53 | class TextPipelineFang(ItemPipeline):
54 | def process_item(self, item):
55 | try:
56 | with codecs.open("fang.csv", 'a', 'gbk') as f:
57 | f.write(
58 | item["province"] + ',' +
59 | item["city"] + ',' +
60 | item["district"] + ',' +
61 | item["avg_price"] + ',' +
62 | item["estate"].replace(',', ',') + ',' +
63 | item["area"] + ',' +
64 | item["layout"] + ',' +
65 | item["total_price"] + ',' +
66 | item["crawl_date"] + ',' +
67 | item["url"] + "\n"
68 | )
69 | except:
70 | logger.error(traceback.format_exc())
71 |
72 |
73 | class TextPipelineFangShop(ItemPipeline):
74 | def process_item(self, item):
75 | try:
76 | with codecs.open("fang_shop.csv", 'a', 'gbk') as f:
77 | f.write(
78 | item["city"] + ',' +
79 | item["district"] + ',' +
80 | item["estate"].replace(',', ',') + ',' +
81 | item["floor"] + ',' +
82 | item["total_floor"] + ',' +
83 | item["type"] + ',' +
84 | item["area"] + ',' +
85 | item["total_price"] + ',' +
86 | item["crawl_date"] + ',' +
87 | item["url"] + "\n"
88 | )
89 | except:
90 | logger.error(traceback.format_exc())
91 |
92 |
93 | class TextPipelineBendibao(ItemPipeline):
94 | def process_item(self, item):
95 | try:
96 | with codecs.open("bendibao.csv", 'a', 'gbk') as f:
97 | f.write(
98 | item["city_name"] + ',' +
99 | item["category1_name"] + ',' +
100 | item["category2_name"] + ',' +
101 | item["result_name"] + ',' +
102 | item["result_mobile"] + "\n"
103 | )
104 | except:
105 | logger.error(traceback.format_exc())
106 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/processor/base_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.downloader.http.spider_request import Request
7 | from sasila.system_normal.utils.decorator import checkResponse
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | def identity(x):
15 | return x
16 |
17 |
18 | class Rule(object):
19 | def __init__(self, link_extractor, callback=None, process_request=identity, priority=0, only_first=False):
20 | self.link_extractor = link_extractor
21 | self.callback = callback
22 | self.process_request = process_request
23 | self.priority = priority
24 | self.only_first = only_first
25 |
26 |
27 | class LinkExtractor(object):
28 | def __init__(self, regex_str=None, css_str=None, process_value=None):
29 | if regex_str:
30 | self.regex = re.compile(regex_str)
31 | else:
32 | self.regex = None
33 | self.css_str = css_str
34 | self.process_value = process_value
35 |
36 | @checkResponse
37 | def extract_links(self, response):
38 | if self.process_value:
39 | return [response.nice_join(link) for link in self.process_value(response.m_response.content)]
40 | elif self.regex:
41 | return [response.nice_join(link) for link in self.regex.findall(response.m_response.content)]
42 | elif self.css_str:
43 | soup = bs(response.m_response.content, 'lxml')
44 | tags = soup.select(self.css_str)
45 | return [response.nice_join(tag.attrs["href"]) for tag in tags]
46 |
47 |
48 | class BaseProcessor(object):
49 | spider_id = None
50 | spider_name = None
51 | start_requests = []
52 | rules = ()
53 | allowed_domains = []
54 |
55 | @checkResponse
56 | def process(self, response):
57 | if hasattr(self, 'rules'):
58 | rules = getattr(self, 'rules', None)
59 | else:
60 | rules = ()
61 | for rule in rules:
62 | links = rule.link_extractor.extract_links(response)
63 | if links:
64 | for link in links:
65 | request = Request(url=link, callback=rule.callback, priority=rule.priority)
66 | request = rule.process_request(request)
67 | yield request
68 | if rule.only_first:
69 | break
70 |
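71 |
72 | # A minimal sketch of a rule-driven processor in the style of the processors in
73 | # this package (identifiers, URLs and selectors are placeholders):
74 | #
75 | # class ExampleProcessor(BaseProcessor):
76 | #     spider_id = 'example'
77 | #     spider_name = 'example'
78 | #     allowed_domains = ['example.com']
79 | #     start_requests = [Request(url='http://www.example.com/')]
80 | #
81 | #     rules = (
82 | #         Rule(LinkExtractor(regex_str=r"/list/\d+"), priority=0),
83 | #         Rule(LinkExtractor(css_str="a.detail"), priority=1, callback='save'),
84 | #     )
85 | #
86 | #     def save(self, response):
87 | #         if response.m_response:
88 | #             print(bs(response.m_response.content, 'lxml').title.string)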
--------------------------------------------------------------------------------
/sasila/system_normal/processor/bendibao_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineBendibao
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | from sasila.system_normal.utils.decorator import checkResponse
13 |
14 | if sys.version_info < (3, 0):
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 | start_requests_temp = []
19 |
20 | with open('city.txt', mode='r') as fs:
21 | lines = fs.readlines()
22 | for line in lines:
23 | request_temp = Request(url=line.strip().split(',')[0] + 'wangdian/', priority=0)
24 | request_temp.meta["city_name"] = line.strip().split(',')[1]
25 | start_requests_temp.append(request_temp)
26 |
27 |
28 | class Bendibao_Processor(BaseProcessor):
29 | spider_id = 'bendibao_spider'
30 | spider_name = 'bendibao_spider'
31 | allowed_domains = ['bendibao.com']
32 | start_requests = start_requests_temp
33 |
34 | @checkResponse
35 | def process(self, response):
36 | soup = bs(response.m_response.content, 'lxml')
37 | category1 = soup.select('div.navlink')
38 | for category in category1:
39 | category1_name = category.select('div.title h2')[0].text
40 | category_2 = category.select('ul.topic li a')
41 | for category_2_one in category_2:
42 | url = response.nice_join(category_2_one['href']) + '/'
43 | category_2_name = category_2_one.text
44 | request = Request(url=url, priority=1, callback=self.process_page_1)
45 | request.meta['city_name'] = response.request.meta['city_name']
46 | request.meta['category1_name'] = category1_name
47 | request.meta['category2_name'] = category_2_name
48 | yield request
49 |
50 | @checkResponse
51 | def process_page_1(self, response):
52 | if '下暂无网点信息' not in response.m_response.content:
53 | soup = bs(response.m_response.content, 'lxml')
54 | results = soup.select('ul.catalist li')
55 | for result in results:
56 | result_name = result.select("div.infoschema h3 a")[0].text
57 | result_mobile = result.find(lambda tag: tag.name == 'p' and '电话:' in tag.text).text
58 | m_result = dict()
59 | m_result['result_name'] = result_name
60 | m_result['result_mobile'] = result_mobile.replace('电话:', '')
61 | m_result['city_name'] = response.request.meta['city_name']
62 | m_result['category1_name'] = response.request.meta['category1_name']
63 | m_result['category2_name'] = response.request.meta['category2_name']
64 | yield m_result
65 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
66 | if next_page:
67 | url_splits = response.request.url.split('/')
68 | url_splits[-1] = next_page['href']
69 | url = '/'.join(url_splits)
70 | request = Request(url=url, priority=1, callback=self.process_page_1)
71 | request.meta['city_name'] = response.request.meta['city_name']
72 | request.meta['category1_name'] = response.request.meta['category1_name']
73 | request.meta['category2_name'] = response.request.meta['category2_name']
74 | yield request
75 |
76 |
77 | if __name__ == '__main__':
78 | SpiderCore(Bendibao_Processor(), time_sleep=0.5).set_pipeline(TextPipelineBendibao()).set_pipeline(
79 | ConsolePipeline()).start()
80 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/car_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 | from sasila.system_normal.utils.decorator import checkResponse
12 | import json
13 | import time
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Car_Processor(BaseProcessor):
21 | spider_id = 'car_spider'
22 | spider_name = 'car_spider'
23 | allowed_domains = ['che168.com']
24 | start_requests = [Request(url='http://www.che168.com', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear')
30 | for province_div in province_div_list:
31 | province_name = province_div.select('span.capital a')[0].text
32 | city_list = province_div.select('div.city a')
33 | for city in city_list:
34 | city_name = city.text
35 | pinyin = city['href'].strip('/').split('/')[0]
36 | request = Request(
37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
38 | priority=1, callback=self.process_page_1)
39 | request.meta['province'] = province_name
40 | request.meta['city'] = city_name
41 | yield request
42 |
43 | @checkResponse
44 | def process_page_1(self, response):
45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312')))
46 | for brand in brand_list:
47 | brand_dict = dict(brand)
48 | brand_name = brand_dict['name']
49 | url = response.nice_join(brand_dict['url']) + '/'
50 | request = Request(url=url, priority=2, callback=self.process_page_2)
51 | request.meta['province'] = response.request.meta['province']
52 | request.meta['city'] = response.request.meta['city']
53 | request.meta['brand'] = brand_name
54 | yield request
55 |
56 | @checkResponse
57 | def process_page_2(self, response):
58 | soup = bs(response.m_response.content, 'lxml')
59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a')
60 | for cars_line in cars_line_list:
61 | cars_line_name = cars_line.text
62 | url = 'http://www.che168.com' + cars_line['href']
63 | request = Request(url=url, priority=3, callback=self.process_page_3)
64 | request.meta['province'] = response.request.meta['province']
65 | request.meta['city'] = response.request.meta['city']
66 | request.meta['brand'] = response.request.meta['brand']
67 | request.meta['cars_line'] = cars_line_name
68 | yield request
69 |
70 | @checkResponse
71 | def process_page_3(self, response):
72 | soup = bs(response.m_response.content, 'lxml')
73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
74 | for car_info in car_info_list:
75 | url = 'http://www.che168.com' + car_info['href']
76 | request = Request(url=url, priority=4, callback=self.process_page_4)
77 | request.meta['province'] = response.request.meta['province']
78 | request.meta['city'] = response.request.meta['city']
79 | request.meta['brand'] = response.request.meta['brand']
80 | request.meta['cars_line'] = response.request.meta['cars_line']
81 | yield request
82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
83 | if next_page:
84 | url = 'http://www.che168.com' + next_page['href']
85 | request = Request(url=url, priority=3, callback=self.process_page_3)
86 | request.meta['province'] = response.request.meta['province']
87 | request.meta['city'] = response.request.meta['city']
88 | request.meta['brand'] = response.request.meta['brand']
89 | request.meta['cars_line'] = response.request.meta['cars_line']
90 | yield request
91 |
92 | @checkResponse
93 | def process_page_4(self, response):
94 | soup = bs(response.m_response.content, 'lxml')
95 | # Some detail pages only return an "Object moved to here." redirect stub,
96 | # so only parse the page when the car title element is present.
97 | #
98 | if len(soup.select('div.car-title h2')) != 0:
99 | car = soup.select('div.car-title h2')[0].text
100 | detail_list = soup.select('div.details li')
101 | if len(detail_list) == 0:
102 | soup = bs(response.m_response.content, 'html5lib')
103 | detail_list = soup.select('div.details li')
104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
105 | first_borad_date = detail_list[1].select('span')[0].text
106 | gear = detail_list[2].select('span')[0].text.split('/')[0]
107 | displacement = detail_list[2].select('span')[0].text.split('/')[1]
108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '')
109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
110 |
111 | item = dict()
112 | item['car'] = car
113 | item['mileage'] = mileage
114 | item['first_borad_date'] = first_borad_date
115 | item['gear'] = gear
116 | item['displacement'] = displacement
117 | item['price'] = price
118 | item['crawl_date'] = crawl_date
119 |
120 | item['province'] = response.request.meta['province']
121 | item['city'] = response.request.meta['city']
122 | item['brand'] = response.request.meta['brand']
123 | item['cars_line'] = response.request.meta['cars_line']
124 | yield item
125 |
126 |
127 | if __name__ == '__main__':
128 | SpiderCore(Car_Processor(), test=True).set_pipeline(ConsolePipeline()).start()
129 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/city.txt:
--------------------------------------------------------------------------------
1 | http://cd.bendibao.com/,成都
2 | http://my.bendibao.com/,绵阳
3 | http://deyang.bendibao.com/,德阳
4 | http://nanchong.bendibao.com/,南充
5 | http://yb.bendibao.com/,宜宾
6 | http://zg.bendibao.com/,自贡
7 | http://leshan.bendibao.com/,乐山
8 | http://luzhou.bendibao.com/,泸州
9 | http://dazhou.bendibao.com/,达州
10 | http://neijiang.bendibao.com/,内江
11 | http://suining.bendibao.com/,遂宁
12 | http://pzh.bendibao.com/,攀枝花
13 | http://ms.bendibao.com/,眉山
14 | http://ga.bendibao.com/,广安
15 | http://zy.bendibao.com/,资阳
16 | http://liangshan.bendibao.com/,凉山
17 | http://guangyuan.bendibao.com/,广元
18 | http://ya.bendibao.com/,雅安
19 | http://bazhong.bendibao.com/,巴中
20 | http://xichang.bendibao.com/,西昌
21 | http://ab.bendibao.com/,阿坝
22 | http://ganzi.bendibao.com/,甘孜
--------------------------------------------------------------------------------
/sasila/system_normal/processor/city_location_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
7 | from sasila.system_normal.downloader.http.spider_request import Request
8 | from bs4 import BeautifulSoup as bs
9 |
10 | if sys.version_info < (3, 0):
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 |
14 |
15 | class CityLocationProcessor(BaseProcessor):
16 | spider_id = 'city'
17 | spider_name = 'city'
18 | allowed_domains = ['supfree.net']
19 | start_requests = [Request(url='http://jingwei.supfree.net/')]
20 |
21 | rules = (
22 | Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
23 | Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"), priority=1, only_first=True, callback='save'),
24 | )
25 |
26 | def save(self, response):
27 | if response.m_response:
28 | soup = bs(response.m_response.content, 'lxml')
29 | name = soup.select("div.cdiv p")[0].string.strip().split(' ')
30 | if len(name) > 2:
31 | province = name[0]
32 | city = name[1]
33 | area = name[2]
34 | elif len(name) > 1:
35 | province = name[0]
36 | city = name[0]
37 | area = name[1]
38 | else:
39 | province = name[0]
40 | city = name[0]
41 | area = name[0]
42 | lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
43 | la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
44 | data = province + ',' + city + ',' + area + ',' + lo + ',' + la
45 | print(data)
46 | with open('city.txt', 'a+') as fs:
47 | # 'data' already holds the CSV line built and printed above
48 | fs.write(data + '\n')
49 |
50 |
51 |
52 | # fe_spider = SpiderCore(CityLocationProcessor())
53 | # if __name__ == '__main__':
54 | # fe_spider.start()
55 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fang_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Fang_Processor(BaseProcessor):
21 | spider_id = 'fang_spider'
22 | spider_name = 'fang_spider'
23 | allowed_domains = ['fang.com']
24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_list = {u'山西'}
30 | province_div_list = soup.select('div#c02 ul li')
31 | for province_div in province_div_list:
32 | province_name = province_div.select('strong')[0].text
33 | if province_name != '其他':
34 | if province_name in province_list:
35 | city_list = province_div.select('a')
36 | for city in city_list:
37 | city_name = city.text
38 | url = city['href']
39 | request = Request(url=url, priority=1, callback=self.process_page_1)
40 | request.meta['province'] = province_name
41 | request.meta['city'] = city_name
42 | yield request
43 |
44 | @checkResponse
45 | def process_page_1(self, response):
46 | soup = bs(response.m_response.content, 'lxml')
47 | district_list = soup.select('div.qxName a')
48 | district_list.pop(0)
49 | for district in district_list:
50 | district_name = district.text
51 | url = response.request.url + district['href']
52 | request = Request(url=url, priority=2, callback=self.process_page_2)
53 | request.meta['province'] = response.request.meta['province']
54 | request.meta['city'] = response.request.meta['city']
55 | request.meta['district'] = district_name
56 | yield request
57 |
58 | @checkResponse
59 | def process_page_2(self, response):
60 | soup = bs(response.m_response.content, 'lxml')
61 | avg_price_list = soup.select('div.newcardR dl')
62 | if len(avg_price_list) > 0:
63 | avg_price = avg_price_list[1].select('dd b')[0].text
64 | else:
65 | avg_price = '未知'
66 | detail_list = soup.select('div.houseList dl')
67 | for detail in detail_list:
68 | if len(detail.select('p.mt10 a span')) != 0:
69 | estate = detail.select('p.mt10 a span')[0].text
70 | area = detail.select('div.area p')[0].text.replace('㎡', '')
71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
74 | item = dict()
75 | item['avg_price'] = avg_price
76 | item['estate'] = estate
77 | item['area'] = area
78 | item['layout'] = layout
79 | item['total_price'] = total_price
80 | item['crawl_date'] = crawl_date
81 |
82 | item['province'] = response.request.meta['province']
83 | item['city'] = response.request.meta['city']
84 | item['district'] = response.request.meta['district']
85 | item['url'] = response.request.url
86 | yield item
87 |
88 | next_page = soup.select('a#PageControl1_hlk_next')
89 | if len(next_page) > 0:
90 | url = response.nice_join(next_page[0]['href'])
91 | request = Request(url=url, priority=2, callback=self.process_page_2)
92 | request.meta['province'] = response.request.meta['province']
93 | request.meta['city'] = response.request.meta['city']
94 | request.meta['district'] = response.request.meta['district']
95 | yield request
96 |
97 |
98 | if __name__ == '__main__':
99 | spider = SpiderCore(Fang_Processor(), test=True).set_pipeline(ConsolePipeline()).start()
100 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fang_shop_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFangShop
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 | from sasila.system_normal.utils import logger
15 |
16 | if sys.version_info < (3, 0):
17 | reload(sys)
18 | sys.setdefaultencoding('utf-8')
19 |
20 |
21 | class Fang_Shop_Processor(BaseProcessor):
22 | spider_id = 'fang_shop_spider'
23 | spider_name = 'fang_shop_spider'
24 | allowed_domains = ['fang.com']
25 | start_requests = [Request(url='http://shop.fang.com', priority=0)]
26 |
27 | @checkResponse
28 | def process(self, response):
29 | city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'}
30 | soup = bs('''北京
31 | 上海
32 | 广州
33 | 深圳
34 | 天津
35 | 重庆
36 | 成都
37 | 苏州
38 | 武汉
39 | 西安
40 | 东莞
41 | 昆明
42 | 杭州
43 | 济南
44 | 无锡
45 | 郑州
46 | 南昌
47 | 青岛
48 | 石家庄
49 | 南京
50 | 大连''', 'lxml')
51 | city__list = soup.select('a')
52 | for city in city__list:
53 | city_name = city.text
54 | if city_name in city_crawl_list:
55 | url = city['href']
56 | request = Request(url=url, priority=1, callback=self.process_page_1)
57 | request.meta['city'] = city_name
58 | yield request
59 |
60 | @checkResponse
61 | def process_page_1(self, response):
62 | soup = bs(response.m_response.content, 'lxml')
63 | district_list = soup.select('div.qxName a')
64 | district_list.pop(0)
65 | for district in district_list:
66 | district_name = district.text
67 | url = response.request.url + district['href']
68 | request = Request(url=url, priority=2, callback=self.process_page_2)
69 | request.meta['city'] = response.request.meta['city']
70 | request.meta['district'] = district_name
71 | yield request
72 |
73 | @checkResponse
74 | def process_page_2(self, response):
75 | soup = bs(response.m_response.content, 'lxml')
76 | detail_list = soup.select('div.houseList dl')
77 | for detail in detail_list:
78 | estate = detail.select('p.mt15 span.spName')[0].text
79 | detail_str = detail.select('p.mt10')[0].text
80 |
81 | temp_list = detail.select('p.mt10')[0].text.split('/')
82 | temp_list = [temp.strip() for temp in temp_list]
83 |
84 | if '购物中心/百货' not in detail_str and '层' in detail_str:
85 | m_type = temp_list[0].replace('类型:', '')
86 | floor = temp_list[1]
87 | total_floor = temp_list[2].replace('层', '')
88 | elif '购物中心/百货' not in detail_str and '层' not in detail_str:
89 | m_type = temp_list[0].strip().replace('类型:', '')
90 | floor = '未知'
91 | total_floor = '未知'
92 | elif '购物中心/百货' in detail_str and '层' not in detail_str:
93 | m_type = temp_list[0].replace('类型:', '') + temp_list[1]
94 | floor = '未知'
95 | total_floor = '未知'
96 | elif '购物中心/百货' in detail_str and '层' in detail_str:
97 | m_type = temp_list[0].replace('类型:', '') + temp_list[1]
98 | floor = temp_list[2]
99 | total_floor = temp_list[3].replace('层', '')
100 | else:
101 | logger.error('unexpected detail_str: ' + detail_str.strip())
102 |
103 | area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '')
104 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
105 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
106 |
107 | item = dict()
108 | item['estate'] = estate
109 | item['floor'] = floor
110 | item['total_floor'] = total_floor
111 | item['type'] = m_type
112 | item['area'] = area
113 | item['total_price'] = total_price
114 | item['crawl_date'] = crawl_date
115 |
116 | item['city'] = response.request.meta['city']
117 | item['district'] = response.request.meta['district']
118 | item['url'] = response.request.url
119 | yield item
120 |
121 | next_page = soup.select('a#PageControl1_hlk_next')
122 | if len(next_page) > 0:
123 | url = response.nice_join(next_page[0]['href']) + '/'
124 | request = Request(url=url, priority=2, callback=self.process_page_2)
125 | request.meta['city'] = response.request.meta['city']
126 | request.meta['district'] = response.request.meta['district']
127 | yield request
128 |
129 |
130 | # if __name__ == '__main__':
131 | # spider = SpiderCore(Fang_Shop_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(
132 | # TextPipelineFangShop()).start()
133 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fe_loan_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
7 |
8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | from bs4 import BeautifulSoup as bs
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class FeProcessor(BaseProcessor):
18 | spider_id = 'fe'
19 | spider_name = 'fe'
20 | allowed_domains = ['58.com']
21 | start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]
22 |
23 | rules = (
24 | Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0),
25 | Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
26 | Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
27 | )
28 |
29 | def save(self, response):
30 | if response.m_response:
31 | print(bs(response.m_response.content, 'lxml').title.string)
32 |
33 |
34 | # fe_spider = SpiderCore(FeProcessor()).set_pipeline(PicPipeline())
35 | # if __name__ == '__main__':
36 | # fe_spider.start()
37 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/first_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class FirstProcessor(BaseProcessor):
18 | spider_id = 'test'
19 | spider_name = 'test'
20 | allowed_domains = ['mzitu.com']
21 | start_requests = [Request(url="http://www.mzitu.com/")]
22 |
23 | def process(self, response):
24 |         if not response.m_response:
25 |             return
26 |         soup = bs(response.m_response.content, 'lxml')
27 |         for a in soup.select("a"):
28 |             if "href" in a.attrs:
29 |                 yield {'url': response.nice_join(a["href"])}
30 |
31 | # if __name__ == '__main__':
32 | # spider = SpiderCore(FirstProcessor()).set_pipeline(ConsolePipeline()).start()
33 |
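34 | # Editor's sketch, not part of the original file: a hypothetical subclass whose
35 | # process() de-duplicates the joined URLs before yielding, so repeated
36 | # navigation links on the same page only produce one item each.
37 | # class DedupFirstProcessor(FirstProcessor):
38 | #     def process(self, response):
39 | #         if not response.m_response:
40 | #             return
41 | #         seen = set()
42 | #         soup = bs(response.m_response.content, 'lxml')
43 | #         for a in soup.select("a"):
44 | #             if "href" in a.attrs:
45 | #                 url = response.nice_join(a["href"])
46 | #                 if url not in seen:
47 | #                     seen.add(url)
48 | #                     yield {'url': url}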
--------------------------------------------------------------------------------
/sasila/system_normal/processor/mzitu_proccessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class MezituProcessor(BaseProcessor):
18 | spider_id = 'mzitu'
19 | spider_name = 'mzitu'
20 | allowed_domains = ['mzitu.com', 'meizitu.net']
21 | start_requests = [Request(url='http://www.mzitu.com/xinggan')]
22 |
23 | def process(self, response):
24 | if response.m_response:
25 | soup = bs(response.m_response.content, "lxml")
26 | total_page = int(soup.select_one("a.next.page-numbers").find_previous_sibling().text)
27 | for page in range(1, total_page + 1):
28 | yield Request(url="http://www.mzitu.com/xinggan/page/" + str(page), callback=self.get_page_content)
29 |
30 | def get_page_content(self, response):
31 | if response.m_response:
32 | soup = bs(response.m_response.content, 'lxml')
33 | li_list = soup.select("div.postlist ul#pins li")
34 | for li in li_list:
35 | yield Request(url=li.select_one("a").attrs["href"], callback=self.get_pic, priority=1)
36 |
37 | def get_pic(self, response):
38 | if response.m_response:
39 | li_soup = bs(response.m_response.content, "lxml")
40 |             next_link = li_soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text)
41 |             if next_link is not None:
42 |                 total_page = int(next_link.find_previous_sibling().text)
43 | for page in range(1, total_page + 1):
44 | yield Request(url=response.request.url + "/" + str(page), callback=self.download_pic, priority=2)
45 |
46 | def download_pic(self, response):
47 | if response.m_response:
48 | href = bs(response.m_response.content, "lxml").select_one("div.main-image img").attrs["src"]
49 | yield Request(url=href, callback=self.download, priority=3)
50 |
51 | def download(self, response):
52 | if response.m_response:
53 | if response.m_response.status_code == 200:
54 | yield response.m_response.content
55 |
56 |
57 | # mzitu_spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline())
58 | #
59 | # if __name__ == '__main__':
60 | # spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline()).start()
61 |
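62 | # Editor's note, not part of the original file: the crawl above is a chained set
63 | # of callbacks -- process() reads the page count of the /xinggan listing and
64 | # requests every listing page, get_page_content() yields one request per gallery,
65 | # get_pic() expands a gallery into its per-image pages, download_pic() extracts
66 | # the image URL, and download() yields the raw bytes (presumably for PicPipeline
67 | # to write to disk). The rising priorities (default -> 1 -> 2 -> 3) appear to be
68 | # there so requests near an actual image are scheduled before new listing pages,
69 | # assuming higher values are dequeued first (see scheduler/queue.py).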
--------------------------------------------------------------------------------
/sasila/system_normal/processor/mzitu_proccessor_regex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
7 |
8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | import os
11 | import uuid
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | class MezituProcessor(BaseProcessor):
19 | spider_id = 'mzitu'
20 | spider_name = 'mzitu'
21 | allowed_domains = ['mzitu.com', 'meizitu.net']
22 | start_requests = [Request(url='http://www.mzitu.com/xinggan/')]
23 |
24 | rules = (
25 | Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
26 | callback="save", priority=3),
27 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
28 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
29 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
30 | )
31 |
32 | def save(self, response):
33 | if response.m_response:
34 | if not os.path.exists("img"):
35 | os.mkdir("img")
36 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
37 | fs.write(response.m_response.content)
38 | print("download success!")
39 |
40 |
41 | # if __name__ == '__main__':
42 | # spider = SpiderCore(MezituProcessor(), batch_size=10).set_pipeline(PicPipeline()).start()
43 |
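44 | # Editor's sketch, not part of the original file: a hypothetical subclass whose
45 | # save() keeps the remote file name instead of a random uuid; it assumes
46 | # response.request.url holds the image URL matched by the first rule above.
47 | # class NamedMezituProcessor(MezituProcessor):
48 | #     def save(self, response):
49 | #         if response.m_response:
50 | #             if not os.path.exists("img"):
51 | #                 os.mkdir("img")
52 | #             name = os.path.basename(response.request.url)
53 | #             with open(os.path.join("img", name), 'wb') as fs:
54 | #                 fs.write(response.m_response.content)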
--------------------------------------------------------------------------------
/sasila/system_normal/processor/qcc_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
7 | from sasila.system_normal.pipeline.text_pipeline import TextPipeline
8 | from sasila.system_normal.processor.base_processor import BaseProcessor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | from bs4 import BeautifulSoup as bs
11 | import time
12 | from sasila.system_normal.utils import logger
13 |
14 | import traceback
15 |
16 | if sys.version_info < (3, 0):
17 | reload(sys)
18 | sys.setdefaultencoding('utf-8')
19 |
20 |
21 | class QccProcessor(BaseProcessor):
22 | spider_id = 'qcc'
23 | spider_name = 'qcc'
24 | allowed_domains = ['qichacha.com']
25 |
26 | start_requests = [
27 | Request(url='http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE')
28 | ]
29 |
30 | def process(self, response):
31 | if not response.m_response:
32 | logger.error(response.request.url)
33 | yield response.request
34 | if '