├── .coveragerc
├── .travis.yml
├── LICENSE
├── README-SETUP.rst
├── README.md
├── pic
│   ├── feijishi.png
│   ├── jiagou.png
│   ├── jichu.png
│   ├── jigou.png
│   ├── jishi-huoqushuju.png
│   ├── jishi-shouquan.png
│   └── spider.jpg
├── requirements.txt
├── sasila-example
│   ├── car_processor.py
│   ├── fang_processor.py
│   ├── main.py
│   ├── proxy.txt
│   └── settings.py
├── sasila
│   ├── __init__.py
│   ├── settings
│   │   ├── __init__.py
│   │   └── default_settings.py
│   ├── system_instant
│   │   ├── __init__.py
│   │   ├── blueprints
│   │   │   ├── __init__.py
│   │   │   └── jd.py
│   │   ├── crawler
│   │   │   ├── __init__.py
│   │   │   └── jd
│   │   │       ├── __init__.py
│   │   │       └── request.py
│   │   ├── database
│   │   │   ├── __init__.py
│   │   │   └── jd_database.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   └── jd_manager.py
│   │   └── settings.py
│   ├── system_normal
│   │   ├── __init__.py
│   │   ├── blueprints
│   │   │   ├── __init__.py
│   │   │   └── slow_spiders.py
│   │   ├── database
│   │   │   └── __init__.py
│   │   ├── downloader
│   │   │   ├── __init__.py
│   │   │   ├── base_downloder.py
│   │   │   ├── http
│   │   │   │   ├── __init__.py
│   │   │   │   ├── selenium_response.py
│   │   │   │   ├── spider_request.py
│   │   │   │   └── spider_response.py
│   │   │   ├── proxy
│   │   │   │   ├── __init__.py
│   │   │   │   └── proxy_pool.py
│   │   │   ├── requests_downloader.py
│   │   │   ├── selenium_downloader.py
│   │   │   └── web_driver_pool.py
│   │   ├── loginer
│   │   │   ├── __init__.py
│   │   │   ├── base_loginer.py
│   │   │   └── jd_loginer.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   └── spider_manager.py
│   │   ├── pipeline
│   │   │   ├── __init__.py
│   │   │   ├── base_pipeline.py
│   │   │   ├── console_pipeline.py
│   │   │   ├── kafa_pipeline.py
│   │   │   ├── pic_pipeline.py
│   │   │   ├── pipe_item.py
│   │   │   ├── test_pipeline.py
│   │   │   └── text_pipeline.py
│   │   ├── processor
│   │   │   ├── __init__.py
│   │   │   ├── base_processor.py
│   │   │   ├── bendibao_processor.py
│   │   │   ├── car_processor.py
│   │   │   ├── city.txt
│   │   │   ├── city_location_processor.py
│   │   │   ├── fang_processor.py
│   │   │   ├── fang_shop_processor.py
│   │   │   ├── fe_loan_processor.py
│   │   │   ├── first_processor.py
│   │   │   ├── mzitu_proccessor.py
│   │   │   ├── mzitu_proccessor_regex.py
│   │   │   ├── qcc_processor.py
│   │   │   └── test_processor.py
│   │   ├── scheduler
│   │   │   ├── __init__.py
│   │   │   ├── bloom_filter.py
│   │   │   └── queue.py
│   │   ├── spider
│   │   │   ├── __init__.py
│   │   │   └── spider_core.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── cookie.py
│   │       ├── decorator.py
│   │       ├── httpobj.py
│   │       ├── jd_code.py
│   │       ├── kafka_utils.py
│   │       ├── progress_bar.py
│   │       ├── python.py
│   │       └── reqser.py
│   └── system_web
│       └── __init__.py
├── setup.py
└── tests
    ├── __init__.py
    └── test_processor.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source =
3 | sasila
4 | parallel = True
5 |
6 | [report]
7 | omit =
8 | sasila/system_instant/*
9 | sasila/system_normal/processor/*
10 |
11 | exclude_lines =
12 | pragma: no cover
13 | def __repr__
14 | if self.debug:
15 | if settings.DEBUG
16 | raise AssertionError
17 | raise NotImplementedError
18 | if 0:
19 | if __name__ == .__main__.:
20 | except ImportError:
21 | pass
22 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | language: python
3 | cache: pip
4 | python:
5 | - "2.7"
6 | - "3.5"
7 | services:
8 | - redis-server
9 | install:
10 | - pip install --no-use-wheel lxml
11 | - pip install --allow-all-external -e .[test]
12 | - pip install coveralls
13 | script:
14 | - coverage run setup.py test
15 | after_success:
16 | - coverage combine
17 | - coveralls
18 |
19 | deploy:
20 | provider: pypi
21 | skip_cleanup: true
22 | distributions: "sdist bdist_wheel"
23 | user: 'DaVinciDW'
24 | password:
25 | secure: "Uh+Q37dRElSmZ1YxiGv9aeg59xyCJ6dSJ87L/P1dFowHx267dX4l9xU9v3skrQSBKEIe8JjCHHT3b9D//1daObWuJ1PHQ6IdD5pp7Lwl2CkBW1TOP9MjAcZV9F0udH3X986owP8KCuwoVJglLWch+3FtI7iNpdrlcIUXlgwS4eAAfF6DmUJG5BSiHDfCdvEyLE2D13MqyXqWNixU9FQ6/5IPfEVrJsW0W0s+fUnvPNSq/R4l9oHrkhUb+2oI7OwYcCG+wXz6KOZaSn69a/sOPRI3thfc9v2FWKsz+XvBhqvNA67q2Q1kHaIn+KZnct+ZJD2tK4NrEJznf4mBliLT31YVsvYHmnsfO34+3W5G+PVdywE2j63uKAFVRzWfYRBVD1UAr0yFuCPD3Ghh7GzHFXEZm5Tltbng2BZQT82BxY4B8IPgHUMf418wRiOBKDGPSoZiHBXVtjwbWez36HOaMenXurLMaoCDWsUzl4QIJF723L5fS/z5Xq8iOoMo+5bsEIfp6BpsYh33n1zL887p03IFJHRnFlCPjdZJ7cQnBV2HTPwUNrls6c8DzaMncUj5W203k48nHm6YhspeS+uIEIrz2eCOgYD5AjjeBRsZfXlG6+DC0+O7Srnuih61xR0vJXQ9PpYCoPI5BMgQo+xwJJz2BP5IX7IpZ2HWJHFKC0E="
26 | on:
27 | branch: master
28 | tags: false
29 | repo: DarkSand/Sasila
30 | condition: $TRAVIS_PYTHON_VERSION = "2.7"
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2014 Binux
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README-SETUP.rst:
--------------------------------------------------------------------------------
1 | Sasila
2 | ======
3 | .. image:: https://img.shields.io/badge/version-0.0.1-green.svg
4 | :target: https://pypi.python.org/pypi/Sasila
5 | :alt: Sasila Version
6 |
7 | .. image:: https://img.shields.io/badge/pypi-v1.4.0-green.svg
8 | :target: https://pypi.python.org/pypi/Sasila
9 | :alt: Wheel Status
10 |
11 | Overview
12 | ========
13 | Sasila is a simple spider system.
14 |
15 | Install
16 | =======
17 |
18 | The quick way::
19 |
20 | pip install sasila
21 |
22 | Tutorial
23 | ========
24 | car_processor.py::
25 |
26 | #!/usr/bin/env python
27 | # -*- coding: utf-8 -*-
28 | from sasila.system_normal.spider.spider_core import SpiderCore
29 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
30 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar
31 | from sasila.system_normal.processor.base_processor import BaseProcessor
32 | from sasila.system_normal.downloader.http.spider_request import Request
33 | from sasila.system_normal.utils.decorator import checkResponse
34 |
35 | from bs4 import BeautifulSoup as bs
36 | import json
37 | import time
38 | import sys
39 |
    40 |     if sys.version_info < (3, 0):
    41 |         reload(sys)
    42 |         sys.setdefaultencoding("utf-8")
43 |
44 | class Car_Processor(BaseProcessor):
45 | spider_id = "car_spider"
46 | spider_name = "car_spider"
47 | allowed_domains = ["che168.com"]
48 | start_requests = [Request(url="http://www.che168.com", priority=0)]
49 |
50 | @checkResponse
51 | def process(self, response):
52 | soup = bs(response.m_response.content, "lxml")
53 | province_div_list = soup.select("div.city-list div.cap-city > div.fn-clear")
54 | for province_div in province_div_list:
55 | province_name = province_div.select("span.capital a")[0].text
56 | city_list = province_div.select("div.city a")
57 | for city in city_list:
58 | city_name = city.text
59 | pinyin = city["href"].strip("/").split("/")[0]
60 | request = Request(
61 | url="http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s" % pinyin,
62 | priority=1, callback=self.process_page_1)
63 | request.meta["province"] = province_name
64 | request.meta["city"] = city_name
65 | yield request
66 |
67 | @checkResponse
68 | def process_page_1(self, response):
69 | brand_list = list(json.loads(response.m_response.content.decode("gb2312")))
70 | for brand in brand_list:
71 | brand_dict = dict(brand)
72 | brand_name = brand_dict["name"]
73 | url = response.nice_join(brand_dict["url"]) + "/"
74 | request = Request(url=url, priority=2, callback=self.process_page_2)
75 | request.meta["province"] = response.request.meta["province"]
76 | request.meta["city"] = response.request.meta["city"]
77 | request.meta["brand"] = brand_name
78 | yield request
79 |
80 | @checkResponse
81 | def process_page_2(self, response):
82 | soup = bs(response.m_response.content, "lxml")
83 | cars_line_list = soup.select("div#series div.content-area dl.model-list dd a")
84 | for cars_line in cars_line_list:
85 | cars_line_name = cars_line.text
86 | url = "http://www.che168.com" + cars_line["href"]
87 | request = Request(url=url, priority=3, callback=self.process_page_3)
88 | request.meta["province"] = response.request.meta["province"]
89 | request.meta["city"] = response.request.meta["city"]
90 | request.meta["brand"] = response.request.meta["brand"]
91 | request.meta["cars_line"] = cars_line_name
92 | yield request
93 |
94 | @checkResponse
95 | def process_page_3(self, response):
96 | soup = bs(response.m_response.content, "lxml")
97 | car_info_list = soup.select("div#a2 ul#viewlist_ul li a.carinfo")
98 | for car_info in car_info_list:
99 | url = "http://www.che168.com" + car_info["href"]
100 | request = Request(url=url, priority=4, callback=self.process_page_4)
101 | request.meta["province"] = response.request.meta["province"]
102 | request.meta["city"] = response.request.meta["city"]
103 | request.meta["brand"] = response.request.meta["brand"]
104 | request.meta["cars_line"] = response.request.meta["cars_line"]
105 | yield request
106 | next_page = soup.find(lambda tag: tag.name == "a" and "下一页" in tag.text)
107 | if next_page:
108 | url = "http://www.che168.com" + next_page["href"]
109 | request = Request(url=url, priority=3, callback=self.process_page_3)
110 | request.meta["province"] = response.request.meta["province"]
111 | request.meta["city"] = response.request.meta["city"]
112 | request.meta["brand"] = response.request.meta["brand"]
113 | request.meta["cars_line"] = response.request.meta["cars_line"]
114 | yield request
115 |
116 | @checkResponse
117 | def process_page_4(self, response):
118 | soup = bs(response.m_response.content, "lxml")
    119 |         # Object moved
120 | # Object moved to here.
121 | #
122 | if len(soup.select("div.car-title h2")) != 0:
123 | car = soup.select("div.car-title h2")[0].text
124 | detail_list = soup.select("div.details li")
125 | if len(detail_list) == 0:
126 | soup = bs(response.m_response.content, "html5lib")
127 | detail_list = soup.select("div.details li")
128 | mileage = detail_list[0].select("span")[0].text.replace("万公里", "")
129 | first_borad_date = detail_list[1].select("span")[0].text
130 | gear = detail_list[2].select("span")[0].text.split("/")[0]
131 | displacement = detail_list[2].select("span")[0].text.split("/")[1]
132 | price = soup.select("div.car-price ins")[0].text.replace("¥", "")
133 | crawl_date = time.strftime("%Y-%m-%d", time.localtime(time.time()))
134 |
135 | item = dict()
136 | item["car"] = car
137 | item["mileage"] = mileage
138 | item["first_borad_date"] = first_borad_date
139 | item["gear"] = gear
140 | item["displacement"] = displacement
141 | item["price"] = price
142 | item["crawl_date"] = crawl_date
143 |
144 | item["province"] = response.request.meta["province"]
145 | item["city"] = response.request.meta["city"]
146 | item["brand"] = response.request.meta["brand"]
147 | item["cars_line"] = response.request.meta["cars_line"]
148 | yield item
149 |
150 | main.py::
151 |
152 | #!/usr/bin/env python
153 | # -*- coding: utf-8 -*-
154 | from car_processor import Car_Processor
155 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
156 | from sasila.system_normal.spider.spider_core import SpiderCore
157 | from sasila.system_normal.manager import manager
    158 |     from sasila import system_web
159 |
160 | spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline())
161 | manager.set_spider(spider_car)
    162 |     system_web.start()
163 |
164 | then start your redis server and run the script::
165 |
166 | python main.py
167 |
168 | then start the spider from your browser::
169 |
170 | http://127.0.0.1:5000/slow_spider/start?spider_id=car_spider
171 |
172 | you can stop the spider::
173 | 
174 |     http://127.0.0.1:5000/slow_spider/stop?spider_id=car_spider
175 |
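you can also check the spider's status in your browser (the detail endpoint is documented in README.md)::

    http://127.0.0.1:5000/slow_spider/detail?spider_id=car_spider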
176 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sasila [![PyPI Version]][PyPI] [![Build Status]][Travis CI] [![Coverage Status]][Coverage]
2 |
3 | There are already many crawler frameworks you can use for crawling work, such as [**scrapy**](https://github.com/scrapy/scrapy), [**webmagic**](https://github.com/code4craft/webmagic) and [**pyspider**](https://github.com/binux/pyspider), and you can also write small, customized crawler scripts directly with [**requests**](https://github.com/requests/requests) + [**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4). In real crawling work, however, each framework has its own strengths and drawbacks. scrapy, for example, is powerful, but that abundance of features can leave newcomers at a loss, and since it is built on the twisted asynchronous framework, its source code is hard for beginners to understand and its projects are hard to debug. So, borrowing the strengths of these frameworks and sticking to the principle of keeping things as simple as possible, I developed this lightweight crawler framework on top of gevent (grequests, to be precise).
4 |
5 | 
6 |
7 | * downloader: downloads pages.
8 | * processor: parses responses.
9 | * scheduler: schedules requests.
10 | * pipeline: handles the extracted data.
11 | * The downloader, processor, scheduler and pipeline are injected into the core to form a spider object.
12 | * Spider objects are managed through the manager.
13 | * The manager exposes an external access/control interface through a web API.
14 |
15 | ## **Key features**
16 |
17 | * The framework's code structure is simple, easy to use and easy to modify; newcomers and veterans alike can keep it under control.
18 | * Concurrency is implemented with gevent; compared with scrapy's twisted, the code is much easier to understand.
19 | * Fully modular design with strong extensibility.
20 | * The usage and structure borrow from [**scrapy**](https://github.com/scrapy/scrapy) and [**webmagic**](https://github.com/code4craft/webmagic), so anyone who has worked with either framework will feel right at home.
21 | * Spiders are not launched from the command line, which makes debugging easier.
22 | * No parsing module is bundled; you are free to extract data with [**beautifulsoup**](https://github.com/il-vladislav/BeautifulSoup4), [**lxml**](https://github.com/lxml/lxml), [**pyquery**](https://github.com/gawel/pyquery), [**html5lib**](https://github.com/html5lib/html5lib-python) or any other parser.
23 | * Built-in proxy/IP rotation.
24 | * Supports high-concurrency crawling.
25 | * Supports distributed crawling.
26 | * Supports incremental crawling.
27 | * Supports crawling JavaScript-rendered pages (just load the SeleniumDownLoader).
28 | * Provides a web API for managing and monitoring spiders.
29 | * Provides an integration approach and structure for instant (on-demand) crawlers.
30 |
31 | ## **Installation**
32 | ```
33 | pip install sasila
34 | ```
35 | ## **Preparation**
36 | * Have your redis server ready for scheduling.
37 | * Write your redis server address into the settings.py file:
38 | ```python
39 | REDIS_HOST = 'localhost'
40 | REDIS_PORT = 6379
41 | ```
42 | ## **Building a processor (parser)**
43 | ```python
44 | #!/usr/bin/env python
45 | # -*- coding: utf-8 -*-
46 | from bs4 import BeautifulSoup as bs
47 | from sasila.system_normal.processor.base_processor import BaseProcessor
48 | from sasila.system_normal.downloader.http.spider_request import Request
49 | from sasila.system_normal.utils.decorator import checkResponse
50 |
51 | class Mzi_Processor(BaseProcessor):
52 | spider_id = 'mzi_spider'
53 | spider_name = 'mzi_spider'
54 | allowed_domains = ['mzitu.com']
55 | start_requests = [Request(url='http://www.mzitu.com/', priority=0)]
56 |
57 | @checkResponse
58 | def process(self, response):
59 | soup = bs(response.m_response.content, 'lxml')
60 |         print(soup.title.string)
61 | href_list = soup.select('a')
62 | for href in href_list:
63 | yield Request(url=response.nice_join(href['href']))
64 | ```
65 | **The syntax is almost identical to scrapy**
66 |
67 | * Every processor inherits from *BaseProcessor*; the default entry parse function is def process(self, response).
68 | * Give the processor a spider_id and a spider_name, and restrict the allowed domains.
69 | * *start_requests* holds the initial requests. A Request object supports GET and POST, priorities, callbacks and every other attribute you would expect when building a request; the default callback is *process*. (A small sketch follows this list.)
70 | * The @checkResponse decorator validates the returned *response* and logs exceptions; you can also define your own decorator.
71 | * Because parse functions use the *yield* keyword, they are generators. When *yield* returns a *Request* object, that request is pushed into the scheduler and waits to be crawled; anything else that is yielded goes into the *pipeline*, which cleans the data and stores it.
72 |
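A minimal sketch of these conventions in one place, mirroring the Request usage shown in the examples elsewhere in this repository (Demo_Processor, example.com and the 'category' key are placeholders, not part of sasila):

```python
from sasila.system_normal.processor.base_processor import BaseProcessor
from sasila.system_normal.downloader.http.spider_request import Request


class Demo_Processor(BaseProcessor):
    spider_id = 'demo_spider'
    spider_name = 'demo_spider'
    allowed_domains = ['example.com']
    start_requests = [Request(url='http://www.example.com/', priority=0)]

    def process(self, response):
        # a follow-up request with a priority, a custom callback and
        # metadata that travels along to that callback
        request = Request(url='http://www.example.com/list', priority=1,
                          callback=self.process_list)
        request.meta['category'] = 'demo'
        yield request

    def process_list(self, response):
        # anything yielded that is not a Request goes to the pipeline
        yield {'category': response.request.meta['category']}
```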
73 | **Like scrapy, sasila also offers a *LinkExtractor* way of extracting links. Below is an example of a *processor* built with *LinkExtractor* that downloads images from mzitu:**
74 |
75 | ```python
76 | #!/usr/bin/env python
77 | # -*- coding: utf-8 -*-
78 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
79 | from sasila.system_normal.downloader.http.spider_request import Request
80 | import os
81 | import uuid
82 |
83 | class MezituProcessor(BaseProcessor):
84 | spider_id = 'mzitu'
85 | spider_name = 'mzitu'
86 | allowed_domains = ['mzitu.com', 'meizitu.net']
87 | start_requests = [Request(url='http://www.mzitu.com/xinggan/')]
88 |
89 | rules = (
90 | Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),callback="save", priority=3),
91 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
92 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
93 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
94 | )
95 |
96 | def save(self, response):
97 | if response.m_response:
98 | if not os.path.exists("img"):
99 | os.mkdir("img")
100 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
101 | fs.write(response.m_response.content)
102 | print("download success!")
103 | ```
104 |
105 | **A LinkExtractor is constructed as follows**
106 |
107 | ```python
108 | LinkExtractor(regex_str=None, css_str=None, process_value=None)
109 | ```
110 |
111 | * *regex_str*: extract links with a regular expression
112 | * *css_str*: extract links with a CSS selector
113 | * You can also supply your own *process_value* to extract links, where *process_value* is a generator; see the sketch after this list
114 | * If you build a *processor* this way, do not define the default entry function def process(self, response)
115 |
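For comparison, a minimal sketch of the other two constructor arguments, written as rules you could drop into a processor's rules tuple. Only regex_str appears in the official example above; the CSS selector, the pick_links generator and its calling convention are illustrative assumptions:

```python
from sasila.system_normal.processor.base_processor import Rule, LinkExtractor

# a CSS-selector based rule (assumed to extract the href of every matched element)
css_rule = Rule(LinkExtractor(css_str='div.postlist a'), priority=1)


# a custom generator handed in via process_value
def pick_links(value):
    # yield whatever links you decide to keep; purely illustrative
    if 'mzitu.com' in value:
        yield value


custom_rule = Rule(LinkExtractor(process_value=pick_links), priority=2)
```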
116 |
117 | ## **Building a pipeline**
118 | After receiving the data, this pipeline converts it to JSON and prints it to the console:
119 | ```python
120 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
121 | import json
122 | class ConsolePipeline(ItemPipeline):
123 |     def process_item(self, item):
124 |         print(json.dumps(item).decode("unicode-escape"))
125 | ```
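As a further illustration, a sketch of a pipeline that appends every item to a JSON-lines file; JsonLinesPipeline and the file name are made up for this example and are not shipped with sasila:

```python
import io
import json

from sasila.system_normal.pipeline.base_pipeline import ItemPipeline


class JsonLinesPipeline(ItemPipeline):
    path = 'items.jsonl'  # arbitrary output file

    def process_item(self, item):
        # one JSON object per line; io.open + utf-8 behaves the same on
        # Python 2 and Python 3
        with io.open(self.path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + u'\n')
```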
126 | ## **Building a spider**
127 | * Create a spider object by injecting a *processor*:
128 | ```python
129 | from sasila.system_normal.spider.spider_core import SpiderCore
130 |
131 | spider = SpiderCore(Mzi_Processor())
132 | ```
133 | * A RequestSpider object carries every attribute you need: the batch size *batch_size*, the download interval *time_sleep*, proxy usage *use_proxy*, and so on:
134 | ```python
135 | SpiderCore(processor=None, downloader=None, use_proxy=False,scheduler=None,batch_size=None,time_sleep=None)
136 | ```
137 | * Proxy IP support is built into the project: when building the RequestSpider, set *use_proxy* to *True* and place a proxy.txt file in the same directory as your script (a short sketch follows the proxy file format below). You can also set the proxy file path in settings.py:
138 | ```python
139 | PROXY_PATH_REQUEST = 'proxy/path'
140 | ```
141 | * Write your proxy IPs into proxy.txt in the format IP,port. If a proxy requires a username and password, append them at the end:
142 | ```text
143 | 127.0.0.1,8080
144 | 127.0.0.2,8080,user,pwd
145 | 127.0.0.3,8080,user,pwd
146 | ```
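Putting the pieces together, a minimal sketch of switching the proxy pool on when building the spider (Mzi_Processor is the processor defined earlier in this README):

```python
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline

# use_proxy=True makes the downloader read proxies from proxy.txt
# (or from PROXY_PATH_REQUEST if you set it in settings.py)
spider = SpiderCore(Mzi_Processor(), use_proxy=True).set_pipeline(ConsolePipeline())
```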
147 | * RequestSpider comes with a default *downloader* and *scheduler*; if they do not meet your needs, you can customize your own.
148 | * You can set a *downloader* and a *pipeline*, and even a *scheduler*, for the spider:
149 | ```python
150 | spider = spider.set_pipeline(ConsolePipeline())
151 | ```
152 | * You can start the spider this way:
153 | ```python
154 | spider.start()
155 | ```
156 | * You can also hand the spider to the *manager* for management:
157 | ```python
158 | from sasila.system_normal.manager import manager
159 | from sasila import system_web
160 |
161 | manager.set_spider(spider)
162 |
163 | system_web.start()
164 | ```
165 |
166 | Visit http://127.0.0.1:5000/slow_spider/start?spider_id=mzi_spider to start the spider.
167 |
168 | Visit http://127.0.0.1:5000/slow_spider/stop?spider_id=mzi_spider to stop the spider.
169 |
170 | Visit http://127.0.0.1:5000/slow_spider/detail?spider_id=mzi_spider to view detailed information about the spider.
171 |
172 | ## **Handling sites that require login**
173 | * You can attach a loginer to the downloader; when the downloader is used, the loginer logs in and obtains cookies before the crawl proceeds.
174 | * You can also maintain your own cookie pool: log in with a batch of accounts and keep the successful cookies in the pool so they can be drawn on at any time. The project does not ship these features yet; pull requests are welcome~ (An illustrative cookie-pool sketch follows.)
175 |
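Purely as an illustration of the cookie-pool idea (this is not sasila API; the redis key and the login routine that produces the cookie dict are assumed):

```python
import json
import random

import redis

r = redis.StrictRedis(host='localhost', port=6379)
POOL_KEY = 'cookie_pool'  # illustrative redis key


def store_cookies(account, cookies):
    # call this after a successful login; cookies is a plain dict
    r.hset(POOL_KEY, account, json.dumps(cookies))


def pick_cookies():
    # hand a random logged-in cookie set to a request/downloader
    accounts = r.hkeys(POOL_KEY)
    if not accounts:
        return None
    return json.loads(r.hget(POOL_KEY, random.choice(accounts)).decode('utf-8'))
```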
176 | ## **Architecture**
177 | 
178 |
179 | * The scheduler dispatches tasks, the downloader fetches the page content, and the processor runs the pre-written Python script, emitting results or new link-extraction tasks (sent back to the scheduler), so the whole process forms a closed loop.
180 | * Each script is treated as a spider, and a spider_id identifies a task.
181 | * downloader
182 | 1. Fetch control: method, header, cookie, proxy, timeout, and so on.
183 | 2. Rendering can be supported by adapting a webkit engine such as phantomjs.
184 | * processor
185 | 1. Parse pages flexibly with pyquery, beautifulsoup and the like.
186 | 2. Full control over all crawl and scheduling parameters from within the script.
187 | 3. Information can be passed along to follow-up requests.
188 | 4. Exception capturing.
189 | * scheduler (a minimal sketch follows the flow chart below)
190 | 1. Task priorities.
191 | 2. Task monitoring.
192 | 3. Task deduplication and related operations.
193 | 4. Incremental support.
194 | * webApi
195 | 1. Create, delete, modify and query spiders.
196 | * Flow chart for non-instant spiders
197 |
198 | 
199 |
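To make the scheduler bullets concrete, here is a sketch of priority scheduling plus URL deduplication on top of redis. It is illustrative only and is not sasila's actual scheduler (which lives in sasila/system_normal/scheduler); the key names are made up:

```python
import redis

r = redis.StrictRedis(host='localhost', port=6379)
QUEUE_KEY = 'demo_spider:queue'  # illustrative key names
SEEN_KEY = 'demo_spider:seen'


def push_request(url, priority):
    # dedup: only enqueue URLs never seen before; keeping the seen-set
    # in redis is also what makes incremental crawling possible
    if r.sadd(SEEN_KEY, url):
        r.zadd(QUEUE_KEY, priority, url)  # redis-py 2.x argument order, as pinned in requirements.txt


def pop_request():
    # take the URL with the highest priority score
    candidates = r.zrevrange(QUEUE_KEY, 0, 0)
    if not candidates:
        return None
    url = candidates[0]
    r.zrem(QUEUE_KEY, url)
    return url
```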
200 | ## **Instant crawler**
201 | An instant crawler can be invoked through an API: you pass in the page or the requirement to crawl, and it crawls the data and returns the result on the spot. This part is not fully developed yet and is provided only as a reference design; the core example code is in *sasila.system_instant*.
202 |
203 | * Instant crawler: data-retrieval flow chart
204 |
205 | 
206 |
207 | * Instant crawler: authorization flow chart
208 |
209 | 
210 |
211 | ## **Why the name Sasila?**
212 |
213 | 
214 |
215 | As a WoW player, you can probably guess ヾ( ̄▽ ̄)
216 |
217 | ## **fetchman**
218 |
219 | A more polished crawler framework, [**fetchman**](https://github.com/DarkSand/fetchman), is now available. It builds on sasila with further optimizations and fixes and drops the web-related features (which I personally found a bit superfluous).
220 |
221 |
222 | [Build Status]: https://img.shields.io/travis/DarkSand/Sasila.svg?branch=master&style=flat
223 | [Travis CI]: https://travis-ci.org/DarkSand/Sasila
224 | [Coverage Status]: https://img.shields.io/coveralls/DarkSand/Sasila.svg?branch=master&style=flat
225 | [Coverage]: https://coveralls.io/github/DarkSand/Sasila
226 | [PyPI Version]: https://img.shields.io/pypi/v/Sasila.svg
227 | [PyPI]: https://pypi.python.org/pypi/sasila
228 |
229 |
230 |
231 |
232 |
233 |
--------------------------------------------------------------------------------
/pic/feijishi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/feijishi.png
--------------------------------------------------------------------------------
/pic/jiagou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jiagou.png
--------------------------------------------------------------------------------
/pic/jichu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jichu.png
--------------------------------------------------------------------------------
/pic/jigou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jigou.png
--------------------------------------------------------------------------------
/pic/jishi-huoqushuju.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-huoqushuju.png
--------------------------------------------------------------------------------
/pic/jishi-shouquan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/jishi-shouquan.png
--------------------------------------------------------------------------------
/pic/spider.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Sasila/c765f07952a4d542d214415b3e9d2a06db16cec5/pic/spider.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==0.11.1
2 | redis==2.10.5
3 | requests==2.13.0
4 | six==1.10.0
5 | SQLAlchemy==1.1.4
6 | grequests==0.3.0
7 | selenium==2.53.6
8 | lxml==3.7.2
9 | beautifulsoup4==4.6.0
10 |
--------------------------------------------------------------------------------
/sasila-example/car_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from sasila.system_normal.spider.spider_core import SpiderCore
4 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
5 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineCar
6 | from sasila.system_normal.processor.base_processor import BaseProcessor
7 | from sasila.system_normal.downloader.http.spider_request import Request
8 | from sasila.system_normal.utils.decorator import checkResponse
9 |
10 | from bs4 import BeautifulSoup as bs
11 | import json
12 | import time
13 | import sys
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Car_Processor(BaseProcessor):
21 | spider_id = 'car_spider'
22 | spider_name = 'car_spider'
23 | allowed_domains = ['che168.com']
24 | start_requests = [Request(url='http://www.che168.com', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear')
30 | for province_div in province_div_list:
31 | province_name = province_div.select('span.capital a')[0].text
32 | city_list = province_div.select('div.city a')
33 | for city in city_list:
34 | city_name = city.text
35 | pinyin = city['href'].strip('/').split('/')[0]
36 | request = Request(
37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
38 | priority=1, callback=self.process_page_1)
39 | request.meta['province'] = province_name
40 | request.meta['city'] = city_name
41 | yield request
42 |
43 | @checkResponse
44 | def process_page_1(self, response):
45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312')))
46 | for brand in brand_list:
47 | brand_dict = dict(brand)
48 | brand_name = brand_dict['name']
49 | url = response.nice_join(brand_dict['url']) + '/'
50 | request = Request(url=url, priority=2, callback=self.process_page_2)
51 | request.meta['province'] = response.request.meta['province']
52 | request.meta['city'] = response.request.meta['city']
53 | request.meta['brand'] = brand_name
54 | yield request
55 |
56 | @checkResponse
57 | def process_page_2(self, response):
58 | soup = bs(response.m_response.content, 'lxml')
59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a')
60 | for cars_line in cars_line_list:
61 | cars_line_name = cars_line.text
62 | url = 'http://www.che168.com' + cars_line['href']
63 | request = Request(url=url, priority=3, callback=self.process_page_3)
64 | request.meta['province'] = response.request.meta['province']
65 | request.meta['city'] = response.request.meta['city']
66 | request.meta['brand'] = response.request.meta['brand']
67 | request.meta['cars_line'] = cars_line_name
68 | yield request
69 |
70 | @checkResponse
71 | def process_page_3(self, response):
72 | soup = bs(response.m_response.content, 'lxml')
73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
74 | for car_info in car_info_list:
75 | url = 'http://www.che168.com' + car_info['href']
76 | request = Request(url=url, priority=4, callback=self.process_page_4)
77 | request.meta['province'] = response.request.meta['province']
78 | request.meta['city'] = response.request.meta['city']
79 | request.meta['brand'] = response.request.meta['brand']
80 | request.meta['cars_line'] = response.request.meta['cars_line']
81 | yield request
82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
83 | if next_page:
84 | url = 'http://www.che168.com' + next_page['href']
85 | request = Request(url=url, priority=3, callback=self.process_page_3)
86 | request.meta['province'] = response.request.meta['province']
87 | request.meta['city'] = response.request.meta['city']
88 | request.meta['brand'] = response.request.meta['brand']
89 | request.meta['cars_line'] = response.request.meta['cars_line']
90 | yield request
91 |
92 | @checkResponse
93 | def process_page_4(self, response):
94 | soup = bs(response.m_response.content.decode('gb2312', 'ignore'), 'lxml')
95 | # Object moved
96 | # Object moved to here.
97 | #
98 | if len(soup.select('div.car-title h2')) != 0:
99 | car = soup.select('div.car-title h2')[0].text
100 | detail_list = soup.select('div.details li')
101 | if len(detail_list) == 0:
102 | soup = bs(response.m_response.content, 'html5lib')
103 | detail_list = soup.select('div.details li')
104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
105 | first_borad_date = detail_list[1].select('span')[0].text
106 | gear = detail_list[2].select('span')[0].text.split('/')[0]
107 | displacement = detail_list[2].select('span')[0].text.split('/')[1]
108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '')
109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
110 |
111 | item = dict()
112 | item['car'] = car
113 | item['mileage'] = mileage
114 | item['first_borad_date'] = first_borad_date
115 | item['gear'] = gear
116 | item['displacement'] = displacement
117 | item['price'] = price
118 | item['crawl_date'] = crawl_date
119 |
120 | item['province'] = response.request.meta['province']
121 | item['city'] = response.request.meta['city']
122 | item['brand'] = response.request.meta['brand']
123 | item['cars_line'] = response.request.meta['cars_line']
124 | yield item
125 |
126 |
127 | if __name__ == '__main__':
128 | SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineCar()).start()
129 |
--------------------------------------------------------------------------------
/sasila-example/fang_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Fang_Processor(BaseProcessor):
21 | spider_id = 'fang_spider'
22 | spider_name = 'fang_spider'
23 | allowed_domains = ['fang.com']
24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
30 | province_div_list = soup.select('div#c02 ul li')
31 | for province_div in province_div_list:
32 | province_name = province_div.select('strong')[0].text
33 | if province_name != '其他':
34 | if province_name in province_list:
35 | city_list = province_div.select('a')
36 | for city in city_list:
37 | city_name = city.text
38 | url = city['href']
39 | request = Request(url=url, priority=1, callback=self.process_page_1)
40 | request.meta['province'] = province_name
41 | request.meta['city'] = city_name
42 | yield request
43 |
44 | @checkResponse
45 | def process_page_1(self, response):
46 | soup = bs(response.m_response.content, 'lxml')
47 | district_list = soup.select('div.qxName a')
48 | district_list.pop(0)
49 | for district in district_list:
50 | district_name = district.text
51 | url = response.request.url + district['href']
52 | request = Request(url=url, priority=2, callback=self.process_page_2)
53 | request.meta['province'] = response.request.meta['province']
54 | request.meta['city'] = response.request.meta['city']
55 | request.meta['district'] = district_name
56 | yield request
57 |
58 | @checkResponse
59 | def process_page_2(self, response):
60 | soup = bs(response.m_response.content, 'lxml')
61 | avg_price_list = soup.select('div.newcardR dl')
62 | if len(avg_price_list) > 0:
63 | avg_price = avg_price_list[1].select('dd b')[0].text
64 | else:
65 | avg_price = '未知'
66 | detail_list = soup.select('div.houseList dl')
67 | for detail in detail_list:
68 | if len(detail.select('p.mt10 a span')) != 0:
69 | estate = detail.select('p.mt10 a span')[0].text
70 | area = detail.select('div.area p')[0].text.replace('㎡', '')
71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
74 | item = dict()
75 | item['avg_price'] = avg_price
76 | item['estate'] = estate
77 | item['area'] = area
78 | item['layout'] = layout
79 | item['total_price'] = total_price
80 | item['crawl_date'] = crawl_date
81 |
82 | item['province'] = response.request.meta['province']
83 | item['city'] = response.request.meta['city']
84 | item['district'] = response.request.meta['district']
85 | item['url'] = response.request.url
86 | yield item
87 |
88 | next_page = soup.select('a#PageControl1_hlk_next')
89 | if len(next_page) > 0:
90 | url = response.nice_join(next_page[0]['href'])
91 | request = Request(url=url, priority=2, callback=self.process_page_2)
92 | request.meta['province'] = response.request.meta['province']
93 | request.meta['city'] = response.request.meta['city']
94 | request.meta['district'] = response.request.meta['district']
95 | yield request
96 |
97 |
98 | if __name__ == '__main__':
99 | spider = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(TextPipelineFang()).start()
100 |
--------------------------------------------------------------------------------
/sasila-example/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import os
5 |
6 | sys.path.append(os.getcwd())
7 |
8 | from car_processor import Car_Processor
9 | from fang_processor import Fang_Processor
10 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
11 | from sasila.system_normal.spider.spider_core import SpiderCore
12 | from sasila.system_normal.manager import manager
13 | from sasila import system_web
14 |
15 | if __name__ == '__main__':
16 | spider_car = SpiderCore(Car_Processor(),batch_size=100).set_pipeline(ConsolePipeline())
17 | spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())
18 | manager.set_spider(spider_car)
19 | manager.set_spider(spider_fang)
20 | system_web.start()
21 |
--------------------------------------------------------------------------------
/sasila-example/proxy.txt:
--------------------------------------------------------------------------------
1 | 127.0.0.1,8888
--------------------------------------------------------------------------------
/sasila-example/settings.py:
--------------------------------------------------------------------------------
1 | # settings
2 |
3 | # phantomjs user agent
4 | # USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
5 |
6 | # phantomjs executable path
7 | # PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe'
8 |
9 | # phantomjs service arguments
10 | # PHANTOMJS_SERVICE = [
11 | # '--proxy=localhost:8888',
12 | # '--proxy-type=http',
13 | # # '--proxy-auth=username:password'
14 | # ]
15 |
16 | # phantomjs driver pool size
17 | # DRIVER_POOL_SIZE = 5
18 |
19 | # proxy file path
20 | # PROXY_PATH_REQUEST = 'proxy/path'
21 |
22 | # redis host
23 | # REDIS_HOST = 'localhost'
24 |
25 | # redis port
26 | REDIS_PORT = 6379
27 |
--------------------------------------------------------------------------------
/sasila/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sasila.settings
4 |
5 | __version__ = '0.0.26'
6 |
--------------------------------------------------------------------------------
/sasila/settings/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import imp
4 | import sys
5 | import os
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 | from sasila.settings import default_settings
12 |
13 | setting_path = os.path.join(os.getcwd(), 'settings.py')
14 |
15 | # If a settings.py file exists in the working directory, override the default settings with it
16 | if os.path.exists(setting_path):
17 | new_settings = imp.load_source('settings', setting_path)
18 |
19 | new_settings_dict = dict()
20 | for key in dir(new_settings):
21 | if key.isupper():
22 | new_settings_dict[key] = getattr(new_settings, key)
23 | if sys.version_info < (3, 0):
24 | for key, value in new_settings_dict.iteritems():
25 | setattr(default_settings, key, value)
26 | else:
27 | for key, value in new_settings_dict.items():
28 | setattr(default_settings, key, value)
--------------------------------------------------------------------------------
/sasila/settings/default_settings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import os
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 | BASE_DIR = os.getcwd()
11 |
12 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
13 |
14 | PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe'
15 |
16 | # PHANTOMJS_SERVICE = [
17 | # '--proxy=localhost:8888',
18 | # '--proxy-type=http',
19 | # # '--proxy-auth=username:password'
20 | # ]
21 |
22 | PHANTOMJS_SERVICE = None
23 |
24 | DRIVER_POOL_SIZE = 5
25 |
26 | PROXY_PATH_REQUEST = os.path.join(BASE_DIR, 'proxy.txt')
27 |
28 | REDIS_HOST = 'localhost'
29 |
30 | REDIS_PORT = 6379
31 |
32 |
--------------------------------------------------------------------------------
/sasila/system_instant/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/blueprints/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/blueprints/jd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import json
5 | from flask import Blueprint
6 | from flask import request
7 | from sasila.system_instant.manager.jd_manager import JdManager
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 | im_jd = Blueprint('im_jd', __name__)
14 |
15 | jd_manager = JdManager()
16 |
17 |
18 | @im_jd.route('/login')
19 | def login():
20 | return jd_manager.login(request.args['collect_token'], request.args['account'], request.args['password'])
21 |
22 |
23 | @im_jd.route('/qrlogin')
24 | def qr_login():
25 | message = jd_manager.qrlogin(request.args['collect_token'])
26 |     # result = '' + message
27 | # return result
28 | return message
29 |
30 |
31 | @im_jd.route('/submit_qrlogin')
32 | def submit_qrlogin():
33 | return jd_manager.submit_qrlogin(request.args['collect_token'])
34 |
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/jd/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/crawler/jd/request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import json
4 | import sys
5 | import time
6 | import requests
7 | from bs4 import BeautifulSoup as bs
8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool
9 | from sasila.system_normal.utils.cookie import formart_selenium_cookies
10 | from sasila.system_normal.utils import logger
11 | from sasila.system_normal.utils import jd_code
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | def abstract(text, start, end):
19 | if text is None or text == '':
20 | return ''
21 | res = ''
22 | if start is not None and start != '':
23 | if start not in text:
24 | return res
25 | else:
26 | text = text[text.index(start) + len(start):]
27 | if end is not None and end != '':
28 | if end not in text:
29 | return res
30 | else:
31 | res = text[0:text.index(end)]
32 | else:
33 | res = text
34 | return res
35 |
36 |
37 | class JdMessage(object):
38 | def __init__(self):
39 | self.code = ""
40 | self.code_description = ""
41 | self.cookies = ""
42 | self.qr_captcha = ""
43 |
44 |
45 | class JdRequest(object):
46 | def __init__(self):
47 | self.web_driver_pool = None # type: Queue
48 |
49 | def init_pool(self):
50 | logger.info('init web driver pool...')
51 | self.web_driver_pool = get_web_driver_pool(1)
52 | logger.info('init web driver pool success...')
53 |
54 | def login(self, account, password):
55 | message = JdMessage()
56 |
57 | web = self.web_driver_pool.get() # type: webdriver.PhantomJS
58 | web.delete_all_cookies()
59 |
60 | web.get("https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F")
61 | element = web.find_element_by_css_selector("div.login-tab.login-tab-r").find_element_by_css_selector("a")
62 | element.click()
63 | element = web.find_element_by_id("loginname")
64 | element.clear()
65 | element.send_keys(account)
66 | element = web.find_element_by_id("nloginpwd")
67 | element.clear()
68 | element.send_keys(password)
69 | element = web.find_element_by_css_selector("a#loginsubmit")
70 | element.click()
71 | time.sleep(3)
72 |
73 | if '我的京东' in bs(web.execute_script("return document.documentElement.outerHTML"), 'lxml').title.string:
74 | message.code = jd_code.SUCCESS
75 | message.code_description = "登录成功"
76 | message.cookies = formart_selenium_cookies(web.get_cookies())
77 | else:
78 |             # cases such as needing an SMS verification code, etc.
79 | pass
80 |
81 | self.web_driver_pool.put(web)
82 | return message
83 |
84 | def qr_login(self):
85 | message = JdMessage()
86 | headers = dict()
87 | headers[
88 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
89 | headers["Accept"] = "*/*"
90 | headers["Accept-Encoding"] = "gzip, deflate"
91 | headers["Accept-Language"] = "zh-CN,en,*"
92 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F"
93 | session = requests.Session()
94 | response = session.get("https://qr.m.jd.com/show?appid=133&size=147&t=" + str(time.time()))
95 |
96 | message.code = jd_code.SUCCESS
97 | message.qr_captcha = response.content.encode("base64")
98 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape")
99 | return message
100 |
101 | def submit_qrlogin(self, cookies):
102 | message = JdMessage()
103 |
104 | headers = dict()
105 | headers[
106 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
107 | headers["Accept"] = "*/*"
108 | headers["Accept-Encoding"] = "gzip, deflate"
109 | headers["Accept-Language"] = "zh-CN,en,*"
110 | headers["Referer"] = "https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fhome.jd.com%2F"
111 | session = requests.Session()
112 |
113 | response = session.get("https://qr.m.jd.com/check?callback=jQuery6172296&appid=133&_=1486609849337",
114 | cookies=json.loads(cookies),
115 | headers=headers)
116 |
117 | ticket = abstract(response.content, '\"ticket\" : \"', '\"')
118 |
119 | headers['X-Requested-With'] = 'XMLHttpRequest'
120 | response = session.get("https://passport.jd.com/uc/qrCodeTicketValidation?t=" + ticket, headers=headers)
121 |
122 | message.code = jd_code.SUCCESS
123 | message.code_description = "登录成功"
124 | message.cookies = json.dumps(session.cookies.get_dict()).decode("unicode-escape")
125 |
126 | return message
127 |
--------------------------------------------------------------------------------
/sasila/system_instant/database/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/database/jd_database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sqlalchemy import Column, Integer, String, DateTime, create_engine
5 | from sqlalchemy.orm import sessionmaker
6 | from sqlalchemy.ext.declarative import declarative_base
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 | # Base class for ORM model classes:
13 | Base = declarative_base()
14 |
15 |
16 | class Process(Base):
17 |     # Table name:
18 | __tablename__ = 'crawler_flow_info'
19 |     # Table columns:
20 | collect_token = Column(String(100), primary_key=True)
21 | customer_id = Column(String(100))
22 | token_valid_time = Column(Integer)
23 | token_create_time = Column(Integer)
24 | status = Column(String(10))
25 | cookies = Column(String(5000))
26 |
27 |
28 | class JdDatabase(object):
29 | def __init__(self):
30 |         # Initialize the database connection:
31 | self.engine = create_engine('mysql+mysqlconnector://root:root@192.168.3.210:3306/hiveengine')
32 |         # Create the DBSession class:
33 | self.DBSession = sessionmaker(bind=self.engine)
34 | self._create_all()
35 |
36 | def _create_all(self):
37 | '''
38 |         Create all tables derived from Base; tables that already exist are left untouched.
39 | :return:
40 | '''
41 | Base.metadata.create_all(self.engine)
42 |
43 | def _drop_all(self):
44 | '''
45 |         Drop all tables in the database.
46 | :return:
47 | '''
48 | Base.metadata.drop_all(self.engine)
49 |
50 | def create_session(self):
51 | return self.DBSession()
52 |
53 | def query_cookie(self, collect_token):
54 | session = self.DBSession()
55 | cookies = session.query(Process).filter(Process.collect_token == collect_token).first().cookies
56 | session.close()
57 | return cookies
58 |
59 | def update_cookie(self, collect_token, cookies):
60 | session = self.DBSession()
61 | session.query(Process).filter(Process.collect_token == collect_token).update({
62 | Process.cookies: cookies
63 | })
64 | session.commit()
65 | session.close()
66 |
--------------------------------------------------------------------------------
/sasila/system_instant/manager/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_instant/manager/jd_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import datetime
4 | from sasila.system_normal.utils import jd_code
5 | import json
6 | from sasila.system_instant.crawler.jd.request import JdRequest
7 | from sasila.system_instant.database.jd_database import *
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | class JdResponse(object):
15 | def __init__(self, code, code_description, qr_captcha=None):
16 | self.code = code
17 | self.code_description = code_description
18 | self.qr_captcha = qr_captcha
19 |
20 |
21 | class JdManager(object):
22 | def __init__(self):
23 | self.database = JdDatabase()
24 | self.request = JdRequest()
25 | self.request.init_pool()
26 |
27 | def login(self, collect_token, account, password):
28 | message = self.request.login(account, password)
29 | if message.code == jd_code.SUCCESS:
30 | self.database.update_cookie(collect_token, message.cookies)
31 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode(
32 | 'unicode-escape')
33 |
34 | def qrlogin(self, collect_token):
35 | message = self.request.qr_login()
36 | if message.code == jd_code.SUCCESS:
37 | self.database.update_cookie(collect_token, message.cookies)
38 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description,
39 | qr_captcha=message.qr_captcha).__dict__).decode(
40 | 'unicode-escape')
41 |
42 | def submit_qrlogin(self, collect_token):
43 | cookies = self.database.query_cookie(collect_token)
44 | message = self.request.submit_qrlogin(cookies)
45 | if message.code == jd_code.SUCCESS:
46 | self.database.update_cookie(collect_token, message.cookies)
47 | return json.dumps(JdResponse(code=message.code, code_description=message.code_description).__dict__).decode(
48 | 'unicode-escape')
49 |
--------------------------------------------------------------------------------
/sasila/system_instant/settings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/blueprints/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/blueprints/slow_spiders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from flask import Blueprint, request
5 | from sasila.system_normal.manager import manager
6 | import json
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 | slow_spider = Blueprint('slow_spider', __name__)
13 |
14 |
15 | @slow_spider.route('/all')
16 | def get_all_spider():
17 | return json.dumps(manager.get_all_spider())
18 |
19 |
20 | @slow_spider.route('/find')
21 | def find_spider():
22 | return json.dumps(manager.find_spider(request.args['spider_id']))
23 |
24 |
25 | @slow_spider.route('/start')
26 | def start_spider():
27 | spider_id = request.args['spider_id']
28 | manager.start_spider(spider_id)
29 | return 'start success:' + spider_id
30 |
31 |
32 | @slow_spider.route('/restart')
33 | def restart_spider():
34 | spider_id = request.args['spider_id']
35 | manager.stop_spider(spider_id)
36 | manager.restart_spider(spider_id)
37 | return 'restart success:' + spider_id
38 |
39 |
40 | @slow_spider.route('/stop')
41 | def stop_spider():
42 | spider_id = request.args['spider_id']
43 | manager.stop_spider(spider_id)
44 | return 'stop success:' + spider_id
45 |
46 |
47 | @slow_spider.route('/detail')
48 | def get_spider_detail():
49 | return manager.get_spider_detail(request.args['spider_id'])
50 |
51 |
52 | @slow_spider.route('/init')
53 | def init_system():
54 | return json.dumps(manager.init_system())
55 |
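56 |
57 | # A rough sketch of how these endpoints are meant to be called once the
58 | # blueprint is registered (the '/slow_spider' URL prefix is an assumption; it
59 | # depends on how the Flask app registers the blueprint):
60 | #
61 | # GET /slow_spider/all
62 | # GET /slow_spider/start?spider_id=car_spider
63 | # GET /slow_spider/detail?spider_id=car_spider
64 | # GET /slow_spider/stop?spider_id=car_spider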
--------------------------------------------------------------------------------
/sasila/system_normal/database/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/base_downloder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class BaseDownLoader(object):
11 | def __init__(self):
12 | self.loginer = None
13 |
14 | def download(self, request):
15 | pass
16 |
17 | def set_loginer(self, loginer):
18 | self.loginer = loginer
19 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/selenium_response.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from posixpath import normpath
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 | from urlparse import urljoin, urlparse, urlunparse
11 | else:
12 | from urllib.parse import urljoin, urlparse, urlunparse
13 |
14 |
15 | class SeleniumResponse(object):
16 | def __init__(self, m_response=None, request=None):
17 | self.request = request
18 | self.m_response = m_response
19 |
20 | def __str__(self):
21 | if self.m_response:
22 | return "<SeleniumResponse %s %sKB>" % (self.request.url, (float(len(self.m_response.content)) / 1000))
23 | else:
24 | return "<SeleniumResponse %s failed>" % self.request.url
25 |
26 | def nice_join(self, url):
27 | url1 = urljoin(self.request.url, url)
28 | arr = urlparse(url1)
29 | path = normpath(arr[2])
30 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment))
31 |
32 | def is_url(self, url):
33 | if re.match(r'^https?:/{2}\w.+$', url):
34 | return True
35 | else:
36 | return False
37 |
38 | __repr__ = __str__
39 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/spider_request.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class Request(object):
11 | def __init__(self, url=None, data=None, json=None, headers=None, method="GET", cookies=None, meta=None,
12 | callback=None,
13 | errback=None, priority=0, allow_redirects=True, timeout=5, duplicate_remove=True):
14 | self.url = url
15 | self.data = data
16 | self.json = json
17 | self.headers = headers
18 | self.method = method
19 | self.allow_redirects = allow_redirects
20 | if not meta:
21 | self.meta = {}
22 | else:
23 | self.meta = meta
24 | self.cookies = cookies
25 | self.callback = callback
26 | self.priority = priority
27 | self.duplicate_remove = duplicate_remove
28 | self.timeout = timeout
29 | self.errback = errback
30 |
31 | def __str__(self):
32 | return "<Request %s %s>" % (self.method, self.url)
33 |
34 | __repr__ = __str__
35 |
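36 |
37 | # A minimal sketch of building a request inside a processor callback (URL and
38 | # meta values are placeholders):
39 | #
40 | # request = Request(url='http://www.che168.com', method='GET', priority=1,
41 | #                   callback=self.process_page_1, timeout=10)
42 | # request.meta['city'] = 'placeholder'
43 | # yield request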
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/http/spider_response.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from posixpath import normpath
6 |
7 | from requests.models import Response as Response_name
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 | from urlparse import urljoin, urlparse, urlunparse
13 | else:
14 | from urllib.parse import urljoin, urlparse, urlunparse
15 |
16 |
17 | class Response(object):
18 | def __init__(self, m_response=None, request=None):
19 | self.request = request
20 | self.m_response = m_response
21 |
22 | def __str__(self):
23 | if isinstance(self.m_response, Response_name):
24 | if self.m_response:
25 | return "<Response [%s] %s %sKB>" % (
26 | self.m_response.status_code, self.m_response.url, (float(len(self.m_response.content)) / 1000))
27 | else:
28 | return "<Response %s error>" % self.request.url
29 | else:
30 | return "<Response %s failed>" % self.request.url
31 |
32 | def nice_join(self, url):
33 | url1 = urljoin(self.request.url, url)
34 | arr = urlparse(url1)
35 | path = normpath(arr[2])
36 | return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment))
37 |
38 | def is_url(self, url):
39 | if re.match(r'^https?:/{2}\w.+$', url):
40 | return True
41 | else:
42 | return False
43 |
44 | __repr__ = __str__
45 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/proxy/proxy_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.settings.default_settings import PROXY_PATH_REQUEST
6 |
7 | if sys.version_info < (3, 0):
8 | from Queue import Queue
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 | else:
12 | from queue import Queue
13 |
14 |
15 | class ProxyPool(object):
16 | def __init__(self):
17 | self.queue = Queue()
18 | with open(PROXY_PATH_REQUEST, 'r') as f:
19 | lines = f.readlines()
20 | self.len = len(lines)
21 | for line in lines:
22 | info = line.strip().split(',')
23 | proxy = {}
24 | if len(info) == 2:
25 | proxy = {"http": "http://%s:%s" % (info[0], info[1]),
26 | "https": "http://%s:%s" % (info[0], info[1])}
27 | elif len(info) == 4:
28 | proxy = {"http": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1]),
29 | "https": "http://%s:%s@%s:%s/" % (info[2], info[3], info[0], info[1])}
30 | self.queue.put(proxy)
31 |
32 | def __len__(self):
33 | return self.len
34 |
35 | def getProxy(self):
36 | proxy = self.queue.get()
37 | self.queue.put(proxy)
38 | return proxy
39 |
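40 |
41 | # A minimal sketch of the proxy.txt format this pool expects (addresses are
42 | # placeholders): each line is either "host,port" or "host,port,user,password".
43 | #
44 | # 127.0.0.1,8888
45 | # 127.0.0.1,8888,user,password
46 | #
47 | # getProxy() hands proxies out round-robin, returning a dict that can be passed
48 | # straight to requests, e.g. {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}.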
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/requests_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import grequests
5 | import requests
6 | from requests.adapters import HTTPAdapter
7 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader
8 | from sasila.system_normal.downloader.http.spider_response import Response
9 | from sasila.system_normal.downloader.proxy.proxy_pool import ProxyPool
10 |
11 | from sasila.system_normal.utils import logger
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | class RequestsDownLoader(BaseDownLoader):
19 | # proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",}
20 |
21 | def __init__(self, loginer=None, use_proxy=False):
22 | self.loginer = loginer
23 | self.use_proxy = use_proxy
24 | if use_proxy:
25 | self.proxy_pool = ProxyPool()
26 | if len(self.proxy_pool) == 0:
27 | self.use_proxy = False
28 | self._cookies = None
29 |
30 | self._headers = dict()
31 | self._headers[
32 | "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
33 | self._headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
34 | self._headers["Accept-Encoding"] = "gzip, deflate, sdch"
35 | self._headers["Accept-Language"] = "zh-CN,zh;q=0.8"
36 | self._request_retry = HTTPAdapter(max_retries=3)
37 |
38 | cookie_dict = dict()
39 | self._cookies = cookie_dict
40 |
41 | def init_loginer(self, account, password):
42 | self._cookies = self.loginer.login(account, password)
43 |
44 | def download(self, batch):
45 | batch_requests = []
46 |
47 | for request in batch:
48 | session = requests.session()
49 | session.mount('https://', self._request_retry)
50 | session.mount('http://', self._request_retry)
51 |
52 | if not request.headers:
53 | request.headers = self._headers
54 | session.headers = self._headers
55 |
56 | if request.method.upper() == "GET":
57 | if self.use_proxy:
58 | m_proxies = self.proxy_pool.getProxy()
59 | batch_requests.append(grequests.get(
60 | session=session,
61 | url=request.url,
62 | headers=request.headers,
63 | cookies=self._cookies,
64 | verify=False,
65 | allow_redirects=request.allow_redirects,
66 | timeout=request.timeout,
67 | proxies=m_proxies
68 | ))
69 | else:
70 | batch_requests.append(grequests.get(
71 | session=session,
72 | url=request.url,
73 | headers=request.headers,
74 | cookies=self._cookies,
75 | verify=False,
76 | allow_redirects=request.allow_redirects,
77 | timeout=request.timeout
78 | ))
79 | elif request.method.upper() == "POST":
80 | if self.use_proxy:
81 | m_proxies = self.proxy_pool.getProxy()
82 | batch_requests.append(grequests.post(
83 | session=session,
84 | url=request.url,
85 | data=request.data,
86 | json=request.json,
87 | headers=request.headers,
88 | cookies=self._cookies,
89 | verify=False,
90 | allow_redirects=request.allow_redirects,
91 | timeout=request.timeout,
92 | proxies=m_proxies
93 | ))
94 | else:
95 | batch_requests.append(grequests.post(
96 | session=session,
97 | url=request.url,
98 | data=request.data,
99 | json=request.json,
100 | headers=request.headers,
101 | cookies=self._cookies,
102 | verify=False,
103 | allow_redirects=request.allow_redirects,
104 | timeout=request.timeout
105 | ))
106 | else:
107 | pass
108 |
109 | rets = grequests.map(batch_requests, exception_handler=exception_handler)
110 |
111 | true_responses = []
112 | index = 0
113 | for ret in rets:
114 | true_response = Response(
115 | m_response=ret,
116 | request=batch[index],
117 | )
118 | true_responses.append(true_response)
119 | logger.info(true_response)
120 | index += 1
121 |
122 | return true_responses
123 |
124 |
125 | def exception_handler(request, exception):
126 | logger.error("%s %s" % (request.url, exception))
127 |
128 |
129 | if __name__ == "__main__":
130 | proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888",}
131 | requests.post(url="http://www.jd.com", data={"123": "fdsgs"})
132 |
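133 |
134 | # A minimal sketch of the batch API (URL is a placeholder): download() takes a
135 | # list of Request objects and returns one Response per request, in order.
136 | #
137 | # from sasila.system_normal.downloader.http.spider_request import Request
138 | # downloader = RequestsDownLoader(use_proxy=False)
139 | # for response in downloader.download([Request(url='http://www.che168.com')]):
140 | #     print(response)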
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/selenium_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.settings import default_settings
6 | from sasila.system_normal.downloader.base_downloder import BaseDownLoader
7 | from sasila.system_normal.downloader.http.selenium_response import SeleniumResponse
8 | from sasila.system_normal.downloader.web_driver_pool import get_web_driver_pool
9 | from sasila.system_normal.utils import logger
10 | from multiprocessing.pool import ThreadPool as Pool
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class SeleniumDownLoader(BaseDownLoader):
18 | def __init__(self, driver_pool_size=None):
19 | self.driver_pool_size = driver_pool_size
20 | logger.info("init web driver pool...")
21 | if driver_pool_size:
22 | self.web_driver_pool = get_web_driver_pool(driver_pool_size)
23 | else:
24 | self.web_driver_pool = get_web_driver_pool(default_settings.DRIVER_POOL_SIZE)
25 | logger.info("init web driver pool success")
26 |
27 | def download_one(self, request):
28 | web = self.web_driver_pool.get() # type:WebDriver
29 | web.get(request.url)
30 | m_response = m_object()
31 | m_response.content = web.execute_script("return document.documentElement.outerHTML")
32 | response = SeleniumResponse(m_response=m_response, request=request)
33 | self.web_driver_pool.put(web)
34 | return response
35 |
36 | def download(self, batch):
37 | if self.driver_pool_size:
38 | pool = Pool(processes=self.driver_pool_size)
39 | else:
40 | pool = Pool(processes=default_settings.DRIVER_POOL_SIZE)
41 |
42 | results = []
43 |
44 | for request in batch:
45 | results.append(pool.apply_async(self.download_one, (request,)))
46 | pool.close()
47 | pool.join()
48 |
49 | true_responses = []
50 | for result in results:
51 | true_response = result.get()
52 | true_responses.append(true_response)
53 | logger.info(true_response)
54 |
55 | return true_responses
56 |
57 |
58 | class m_object(object):
59 | pass
60 |
--------------------------------------------------------------------------------
/sasila/system_normal/downloader/web_driver_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 |
6 | from selenium import webdriver
7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
8 |
9 | from sasila.settings import default_settings
10 |
11 | if sys.version_info < (3, 0):
12 | from Queue import Queue
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 | else:
16 | from queue import Queue
17 |
18 | dcap = dict(DesiredCapabilities.PHANTOMJS)
19 | dcap["phantomjs.page.settings.resourceTimeout"] = 10
20 | dcap["phantomjs.page.settings.loadImages"] = True
21 | dcap["phantomjs.page.settings.userAgent"] = default_settings.USER_AGENT
22 |
23 |
24 | def _get_base_driver():
25 | if default_settings.PHANTOMJS_SERVICE:
26 | web = webdriver.PhantomJS(service_args=default_settings.PHANTOMJS_SERVICE,
27 | executable_path=default_settings.PHANTOMJS_PATH, desired_capabilities=dcap)
28 | else:
29 | web = webdriver.PhantomJS(executable_path=default_settings.PHANTOMJS_PATH,
30 | desired_capabilities=dcap)
31 | return web
32 |
33 |
34 | def get_web_driver_pool(num):
35 | driver_queue = Queue()
36 | i = 0
37 | while i < num:
38 | web = _get_base_driver()
39 | driver_queue.put(web)
40 | i += 1
41 | return driver_queue
42 |
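43 |
44 | # A minimal sketch of borrowing a PhantomJS driver from the pool (pool size and
45 | # URL are placeholders); put the driver back after use so other workers can
46 | # reuse it:
47 | #
48 | # pool = get_web_driver_pool(2)
49 | # web = pool.get()
50 | # web.get('http://www.mzitu.com/')
51 | # html = web.execute_script("return document.documentElement.outerHTML")
52 | # pool.put(web)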
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/base_loginer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class BaseLoginer(object):
11 | def login(self, account, password):
12 | cookies = ""
13 | return cookies
14 |
--------------------------------------------------------------------------------
/sasila/system_normal/loginer/jd_loginer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/manager/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from sasila.system_normal.manager.spider_manager import SpiderManager
4 | import sys
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 | manager = SpiderManager()
11 |
--------------------------------------------------------------------------------
/sasila/system_normal/manager/spider_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import json
5 | import threading
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 |
12 | class SpiderManager(object):
13 | def __init__(self):
14 | self.spider_list = dict()
15 |
16 | def set_spider(self, spider):
17 | self.spider_list[spider._spider_id] = spider
18 |
19 | def del_spider(self, spider_id):
20 | if spider_id in self.spider_list.keys():
21 | self.spider_list[spider_id].stop()
22 | del self.spider_list[spider_id]
23 |
24 | def init_system(self):
25 | pass
26 |
27 | def get_all_spider(self):
28 | return json.dumps(list(self.spider_list.keys()))
29 |
30 | def find_spider(self, spider_id):
31 | pass
32 |
33 | def start_spider(self, spider_id):
34 | if self.spider_list[spider_id]._spider_status == "stopped":
35 | thread = threading.Thread(target=self.spider_list[spider_id].start)
36 | thread.setDaemon(True)
37 | thread.start()
38 |
39 | def restart_spider(self, spider_id):
40 | thread = threading.Thread(target=self.spider_list[spider_id].restart)
41 | thread.setDaemon(True)
42 | thread.start()
43 |
44 | def stop_spider(self, spider_id):
45 | self.spider_list[spider_id].stop()
46 |
47 | def get_spider_detail(self, spider_id):
48 | return str(self.spider_list[spider_id]._process_count)
49 |
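50 |
51 | # A minimal sketch of driving the manager (the spider itself is assumed to be a
52 | # configured SpiderCore instance exposing _spider_id):
53 | #
54 | # manager = SpiderManager()
55 | # manager.set_spider(spider)
56 | # manager.start_spider(spider._spider_id)   # runs spider.start() in a daemon thread when it is stopped
57 | # print(manager.get_spider_detail(spider._spider_id))
58 | # manager.stop_spider(spider._spider_id)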
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/base_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | class ItemPipeline(object):
11 | def process_item(self, item):
12 | raise NotImplementedError
13 |
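14 |
15 | # A minimal sketch of a custom pipeline: pipelines are attached with
16 | # SpiderCore.set_pipeline(), and process_item appears to be called with each
17 | # non-request result a processor yields.
18 | #
19 | # class LoggingPipeline(ItemPipeline):
20 | #     def process_item(self, item):
21 | #         print(item)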
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/console_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | import json
6 |
7 | if sys.version_info < (3, 0):
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 |
12 | class ConsolePipeline(ItemPipeline):
13 | def process_item(self, item):
14 | if sys.version_info < (3, 0):
15 | print(json.dumps(item).decode("unicode-escape"))
16 | else:
17 | print(json.dumps(item).encode('utf8').decode("unicode-escape"))
18 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/kafa_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # import sys
4 | # from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | # import json
6 | # from sasila.system_normal.utils.kafka_utils import send_message
7 | #
8 | # reload(sys)
9 | # sys.setdefaultencoding('utf-8')
10 | #
11 | #
12 | # class KafkaPipeline(ItemPipeline):
13 | # def process_item(self, item):
14 | # send_message("dataCollectionTopic", bytes("CompanyConsummer__" + json.dumps(item).decode("unicode-escape")))
15 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/pic_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import sys
5 | import uuid
6 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
7 |
8 | if sys.version_info < (3, 0):
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 |
13 | class PicPipeline(ItemPipeline):
14 | def process_item(self, item):
15 | if item is not None:
16 | if not os.path.exists("img"):
17 | os.mkdir("img")
18 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
19 | fs.write(item)
20 | print("download success!")
21 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/pipe_item.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | if sys.version_info < (3, 0):
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 |
8 |
9 | class pipeItem(object):
10 | def __init__(self, pipenames=None, result=None):
11 | self.pipenames = pipenames if pipenames is not None else []
12 | self.result = result
13 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/test_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 |
6 | if sys.version_info < (3, 0):
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class TestPipeline(ItemPipeline):
12 | def __init__(self):
13 | self.result = {}
14 |
15 | def process_item(self, item):
16 | self.result = item
17 |
--------------------------------------------------------------------------------
/sasila/system_normal/pipeline/text_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from sasila.system_normal.pipeline.base_pipeline import ItemPipeline
5 | from sasila.system_normal.utils import logger
6 | import traceback
7 | import codecs
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | class TextPipeline(ItemPipeline):
15 | def process_item(self, item):
16 | with open("result.txt", 'a') as f:
17 | f.write(
18 | item["province"] + ',' +
19 | item["city"] + ',' +
20 | item["company_name"] + ',' +
21 | item["company_man"] + ',' +
22 | item["company_telephone"] + ',' +
23 | item["company_address"] + ',' +
24 | item["company_registered_capital"] + ',' +
25 | item["company_registered_time"] + ',' +
26 | item["company_status"] + ',' +
27 | item["source"] + ',' +
28 | item["update_time"] + "\n"
29 | )
30 |
31 |
32 | class TextPipelineCar(ItemPipeline):
33 | def process_item(self, item):
34 | try:
35 | with codecs.open("result.csv", 'a', 'gbk') as f:
36 | f.write(
37 | item["province"] + ',' +
38 | item["city"] + ',' +
39 | item["brand"].replace(u'\u30fb', '·') + ',' +
40 | item["cars_line"].replace(u'\u30fb', '·') + ',' +
41 | item["car"].replace(u'\u30fb', '·') + ',' +
42 | item["mileage"] + ',' +
43 | item["first_borad_date"] + ',' +
44 | item["gear"] + ',' +
45 | item["displacement"] + ',' +
46 | item["price"] + ',' +
47 | item["crawl_date"] + "\n"
48 | )
49 | except:
50 | logger.error(traceback.format_exc())
51 |
52 |
53 | class TextPipelineFang(ItemPipeline):
54 | def process_item(self, item):
55 | try:
56 | with codecs.open("fang.csv", 'a', 'gbk') as f:
57 | f.write(
58 | item["province"] + ',' +
59 | item["city"] + ',' +
60 | item["district"] + ',' +
61 | item["avg_price"] + ',' +
62 | item["estate"].replace(',', ',') + ',' +
63 | item["area"] + ',' +
64 | item["layout"] + ',' +
65 | item["total_price"] + ',' +
66 | item["crawl_date"] + ',' +
67 | item["url"] + "\n"
68 | )
69 | except:
70 | logger.error(traceback.format_exc())
71 |
72 |
73 | class TextPipelineFangShop(ItemPipeline):
74 | def process_item(self, item):
75 | try:
76 | with codecs.open("fang_shop.csv", 'a', 'gbk') as f:
77 | f.write(
78 | item["city"] + ',' +
79 | item["district"] + ',' +
80 | item["estate"].replace(',', ',') + ',' +
81 | item["floor"] + ',' +
82 | item["total_floor"] + ',' +
83 | item["type"] + ',' +
84 | item["area"] + ',' +
85 | item["total_price"] + ',' +
86 | item["crawl_date"] + ',' +
87 | item["url"] + "\n"
88 | )
89 | except:
90 | logger.error(traceback.format_exc())
91 |
92 |
93 | class TextPipelineBendibao(ItemPipeline):
94 | def process_item(self, item):
95 | try:
96 | with codecs.open("bendibao.csv", 'a', 'gbk') as f:
97 | f.write(
98 | item["city_name"] + ',' +
99 | item["category1_name"] + ',' +
100 | item["category2_name"] + ',' +
101 | item["result_name"] + ',' +
102 | item["result_mobile"] + "\n"
103 | )
104 | except:
105 | logger.error(traceback.format_exc())
106 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | if sys.version_info < (3, 0):
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/sasila/system_normal/processor/base_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import re
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.downloader.http.spider_request import Request
7 | from sasila.system_normal.utils.decorator import checkResponse
8 |
9 | if sys.version_info < (3, 0):
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | def identity(x):
15 | return x
16 |
17 |
18 | class Rule(object):
19 | def __init__(self, link_extractor, callback=None, process_request=identity, priority=0, only_first=False):
20 | self.link_extractor = link_extractor
21 | self.callback = callback
22 | self.process_request = process_request
23 | self.priority = priority
24 | self.only_first = only_first
25 |
26 |
27 | class LinkExtractor(object):
28 | def __init__(self, regex_str=None, css_str=None, process_value=None):
29 | if regex_str:
30 | self.regex = re.compile(regex_str)
31 | else:
32 | self.regex = None
33 | self.css_str = css_str
34 | self.process_value = process_value
35 |
36 | @checkResponse
37 | def extract_links(self, response):
38 | if self.process_value:
39 | return [response.nice_join(link) for link in self.process_value(response.m_response.content)]
40 | elif self.regex:
41 | return [response.nice_join(link) for link in self.regex.findall(response.m_response.content)]
42 | elif self.css_str:
43 | soup = bs(response.m_response.content, 'lxml')
44 | tags = soup.select(self.css_str)
45 | return [response.nice_join(tag.attrs["href"]) for tag in tags]
46 |
47 |
48 | class BaseProcessor(object):
49 | spider_id = None
50 | spider_name = None
51 | start_requests = []
52 | rules = ()
53 | allowed_domains = []
54 |
55 | @checkResponse
56 | def process(self, response):
57 | if hasattr(self, 'rules'):
58 | rules = getattr(self, 'rules', None)
59 | else:
60 | rules = ()
61 | for rule in rules:
62 | links = rule.link_extractor.extract_links(response)
63 | if links:
64 | for link in links:
65 | request = Request(url=link, callback=rule.callback, priority=rule.priority)
66 | request = rule.process_request(request)
67 | yield request
68 | if rule.only_first:
69 | break
70 |
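71 |
72 | # A minimal sketch of a rule-driven processor in the style of the processors in
73 | # this package (identifiers, URLs and selectors are placeholders):
74 | #
75 | # class ExampleProcessor(BaseProcessor):
76 | #     spider_id = 'example'
77 | #     spider_name = 'example'
78 | #     allowed_domains = ['example.com']
79 | #     start_requests = [Request(url='http://www.example.com/')]
80 | #
81 | #     rules = (
82 | #         Rule(LinkExtractor(regex_str=r"/list/\d+"), priority=0),
83 | #         Rule(LinkExtractor(css_str="a.detail"), priority=1, callback='save'),
84 | #     )
85 | #
86 | #     def save(self, response):
87 | #         if response.m_response:
88 | #             print(bs(response.m_response.content, 'lxml').title.string)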
--------------------------------------------------------------------------------
/sasila/system_normal/processor/bendibao_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineBendibao
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | from sasila.system_normal.utils.decorator import checkResponse
13 |
14 | if sys.version_info < (3, 0):
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 | start_requests_temp = []
19 |
20 | with open('city.txt', mode='r') as fs:
21 | lines = fs.readlines()
22 | for line in lines:
23 | request_temp = Request(url=line.strip().split(',')[0] + 'wangdian/', priority=0)
24 | request_temp.meta["city_name"] = line.strip().split(',')[1]
25 | start_requests_temp.append(request_temp)
26 |
27 |
28 | class Bendibao_Processor(BaseProcessor):
29 | spider_id = 'bendibao_spider'
30 | spider_name = 'bendibao_spider'
31 | allowed_domains = ['bendibao.com']
32 | start_requests = start_requests_temp
33 |
34 | @checkResponse
35 | def process(self, response):
36 | soup = bs(response.m_response.content, 'lxml')
37 | category1 = soup.select('div.navlink')
38 | for category in category1:
39 | category1_name = category.select('div.title h2')[0].text
40 | category_2 = category.select('ul.topic li a')
41 | for category_2_one in category_2:
42 | url = response.nice_join(category_2_one['href']) + '/'
43 | category_2_name = category_2_one.text
44 | request = Request(url=url, priority=1, callback=self.process_page_1)
45 | request.meta['city_name'] = response.request.meta['city_name']
46 | request.meta['category1_name'] = category1_name
47 | request.meta['category2_name'] = category_2_name
48 | yield request
49 |
50 | @checkResponse
51 | def process_page_1(self, response):
52 | if '下暂无网点信息' not in response.m_response.content:
53 | soup = bs(response.m_response.content, 'lxml')
54 | results = soup.select('ul.catalist li')
55 | for result in results:
56 | result_name = result.select("div.infoschema h3 a")[0].text
57 | result_mobile = result.find(lambda tag: tag.name == 'p' and '电话:' in tag.text).text
58 | m_result = dict()
59 | m_result['result_name'] = result_name
60 | m_result['result_mobile'] = result_mobile.replace('电话:', '')
61 | m_result['city_name'] = response.request.meta['city_name']
62 | m_result['category1_name'] = response.request.meta['category1_name']
63 | m_result['category2_name'] = response.request.meta['category2_name']
64 | yield m_result
65 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
66 | if next_page:
67 | url_splits = response.request.url.split('/')
68 | url_splits[-1] = next_page['href']
69 | url = '/'.join(url_splits)
70 | request = Request(url=url, priority=1, callback=self.process_page_1)
71 | request.meta['city_name'] = response.request.meta['city_name']
72 | request.meta['category1_name'] = response.request.meta['category1_name']
73 | request.meta['category2_name'] = response.request.meta['category2_name']
74 | yield request
75 |
76 |
77 | if __name__ == '__main__':
78 | SpiderCore(Bendibao_Processor(), time_sleep=0.5).set_pipeline(TextPipelineBendibao()).set_pipeline(
79 | ConsolePipeline()).start()
80 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/car_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 | from sasila.system_normal.utils.decorator import checkResponse
12 | import json
13 | import time
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Car_Processor(BaseProcessor):
21 | spider_id = 'car_spider'
22 | spider_name = 'car_spider'
23 | allowed_domains = ['che168.com']
24 | start_requests = [Request(url='http://www.che168.com', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear')
30 | for province_div in province_div_list:
31 | province_name = province_div.select('span.capital a')[0].text
32 | city_list = province_div.select('div.city a')
33 | for city in city_list:
34 | city_name = city.text
35 | pinyin = city['href'].strip('/').split('/')[0]
36 | request = Request(
37 | url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
38 | priority=1, callback=self.process_page_1)
39 | request.meta['province'] = province_name
40 | request.meta['city'] = city_name
41 | yield request
42 |
43 | @checkResponse
44 | def process_page_1(self, response):
45 | brand_list = list(json.loads(response.m_response.content.decode('gb2312')))
46 | for brand in brand_list:
47 | brand_dict = dict(brand)
48 | brand_name = brand_dict['name']
49 | url = response.nice_join(brand_dict['url']) + '/'
50 | request = Request(url=url, priority=2, callback=self.process_page_2)
51 | request.meta['province'] = response.request.meta['province']
52 | request.meta['city'] = response.request.meta['city']
53 | request.meta['brand'] = brand_name
54 | yield request
55 |
56 | @checkResponse
57 | def process_page_2(self, response):
58 | soup = bs(response.m_response.content, 'lxml')
59 | cars_line_list = soup.select('div#series div.content-area dl.model-list dd a')
60 | for cars_line in cars_line_list:
61 | cars_line_name = cars_line.text
62 | url = 'http://www.che168.com' + cars_line['href']
63 | request = Request(url=url, priority=3, callback=self.process_page_3)
64 | request.meta['province'] = response.request.meta['province']
65 | request.meta['city'] = response.request.meta['city']
66 | request.meta['brand'] = response.request.meta['brand']
67 | request.meta['cars_line'] = cars_line_name
68 | yield request
69 |
70 | @checkResponse
71 | def process_page_3(self, response):
72 | soup = bs(response.m_response.content, 'lxml')
73 | car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
74 | for car_info in car_info_list:
75 | url = 'http://www.che168.com' + car_info['href']
76 | request = Request(url=url, priority=4, callback=self.process_page_4)
77 | request.meta['province'] = response.request.meta['province']
78 | request.meta['city'] = response.request.meta['city']
79 | request.meta['brand'] = response.request.meta['brand']
80 | request.meta['cars_line'] = response.request.meta['cars_line']
81 | yield request
82 | next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
83 | if next_page:
84 | url = 'http://www.che168.com' + next_page['href']
85 | request = Request(url=url, priority=3, callback=self.process_page_3)
86 | request.meta['province'] = response.request.meta['province']
87 | request.meta['city'] = response.request.meta['city']
88 | request.meta['brand'] = response.request.meta['brand']
89 | request.meta['cars_line'] = response.request.meta['cars_line']
90 | yield request
91 |
92 | @checkResponse
93 | def process_page_4(self, response):
94 | soup = bs(response.m_response.content, 'lxml')
95 | # Some detail pages only return an "Object moved to here." redirect stub,
96 | # so only parse the page when the car title element is present.
97 | #
98 | if len(soup.select('div.car-title h2')) != 0:
99 | car = soup.select('div.car-title h2')[0].text
100 | detail_list = soup.select('div.details li')
101 | if len(detail_list) == 0:
102 | soup = bs(response.m_response.content, 'html5lib')
103 | detail_list = soup.select('div.details li')
104 | mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
105 | first_borad_date = detail_list[1].select('span')[0].text
106 | gear = detail_list[2].select('span')[0].text.split('/')[0]
107 | displacement = detail_list[2].select('span')[0].text.split('/')[1]
108 | price = soup.select('div.car-price ins')[0].text.replace('¥', '')
109 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
110 |
111 | item = dict()
112 | item['car'] = car
113 | item['mileage'] = mileage
114 | item['first_borad_date'] = first_borad_date
115 | item['gear'] = gear
116 | item['displacement'] = displacement
117 | item['price'] = price
118 | item['crawl_date'] = crawl_date
119 |
120 | item['province'] = response.request.meta['province']
121 | item['city'] = response.request.meta['city']
122 | item['brand'] = response.request.meta['brand']
123 | item['cars_line'] = response.request.meta['cars_line']
124 | yield item
125 |
126 |
127 | if __name__ == '__main__':
128 | SpiderCore(Car_Processor(), test=True).set_pipeline(ConsolePipeline()).start()
129 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/city.txt:
--------------------------------------------------------------------------------
1 | http://cd.bendibao.com/,成都
2 | http://my.bendibao.com/,绵阳
3 | http://deyang.bendibao.com/,德阳
4 | http://nanchong.bendibao.com/,南充
5 | http://yb.bendibao.com/,宜宾
6 | http://zg.bendibao.com/,自贡
7 | http://leshan.bendibao.com/,乐山
8 | http://luzhou.bendibao.com/,泸州
9 | http://dazhou.bendibao.com/,达州
10 | http://neijiang.bendibao.com/,内江
11 | http://suining.bendibao.com/,遂宁
12 | http://pzh.bendibao.com/,攀枝花
13 | http://ms.bendibao.com/,眉山
14 | http://ga.bendibao.com/,广安
15 | http://zy.bendibao.com/,资阳
16 | http://liangshan.bendibao.com/,凉山
17 | http://guangyuan.bendibao.com/,广元
18 | http://ya.bendibao.com/,雅安
19 | http://bazhong.bendibao.com/,巴中
20 | http://xichang.bendibao.com/,西昌
21 | http://ab.bendibao.com/,阿坝
22 | http://ganzi.bendibao.com/,甘孜
--------------------------------------------------------------------------------
/sasila/system_normal/processor/city_location_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
7 | from sasila.system_normal.downloader.http.spider_request import Request
8 | from bs4 import BeautifulSoup as bs
9 |
10 | if sys.version_info < (3, 0):
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 |
14 |
15 | class CityLocationProcessor(BaseProcessor):
16 | spider_id = 'city'
17 | spider_name = 'city'
18 | allowed_domains = ['supfree.net']
19 | start_requests = [Request(url='http://jingwei.supfree.net/')]
20 |
21 | rules = (
22 | Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
23 | Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"), priority=1, only_first=True, callback='save'),
24 | )
25 |
26 | def save(self, response):
27 | if response.m_response:
28 | soup = bs(response.m_response.content, 'lxml')
29 | name = soup.select("div.cdiv p")[0].string.strip().split(' ')
30 | if len(name) > 2:
31 | province = name[0]
32 | city = name[1]
33 | area = name[2]
34 | elif len(name) > 1:
35 | province = name[0]
36 | city = name[0]
37 | area = name[1]
38 | else:
39 | province = name[0]
40 | city = name[0]
41 | area = name[0]
42 | lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
43 | la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
44 | data = province + ',' + city + ',' + area + ',' + lo + ',' + la
45 | print(data)
46 | with open('city.txt', 'a+') as fs:
47 | # 'data' already holds the CSV line built and printed above
48 | fs.write(data + '\n')
49 |
50 |
51 |
52 | # fe_spider = SpiderCore(CityLocationProcessor())
53 | # if __name__ == '__main__':
54 | # fe_spider.start()
55 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fang_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFang
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 |
15 | if sys.version_info < (3, 0):
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class Fang_Processor(BaseProcessor):
21 | spider_id = 'fang_spider'
22 | spider_name = 'fang_spider'
23 | allowed_domains = ['fang.com']
24 | start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)]
25 |
26 | @checkResponse
27 | def process(self, response):
28 | soup = bs(response.m_response.content, 'lxml')
29 | province_list = {u'山西'}
30 | province_div_list = soup.select('div#c02 ul li')
31 | for province_div in province_div_list:
32 | province_name = province_div.select('strong')[0].text
33 | if province_name != '其他':
34 | if province_name in province_list:
35 | city_list = province_div.select('a')
36 | for city in city_list:
37 | city_name = city.text
38 | url = city['href']
39 | request = Request(url=url, priority=1, callback=self.process_page_1)
40 | request.meta['province'] = province_name
41 | request.meta['city'] = city_name
42 | yield request
43 |
44 | @checkResponse
45 | def process_page_1(self, response):
46 | soup = bs(response.m_response.content, 'lxml')
47 | district_list = soup.select('div.qxName a')
48 | district_list.pop(0)
49 | for district in district_list:
50 | district_name = district.text
51 | url = response.request.url + district['href']
52 | request = Request(url=url, priority=2, callback=self.process_page_2)
53 | request.meta['province'] = response.request.meta['province']
54 | request.meta['city'] = response.request.meta['city']
55 | request.meta['district'] = district_name
56 | yield request
57 |
58 | @checkResponse
59 | def process_page_2(self, response):
60 | soup = bs(response.m_response.content, 'lxml')
61 | avg_price_list = soup.select('div.newcardR dl')
62 | if len(avg_price_list) > 0:
63 | avg_price = avg_price_list[1].select('dd b')[0].text
64 | else:
65 | avg_price = '未知'
66 | detail_list = soup.select('div.houseList dl')
67 | for detail in detail_list:
68 | if len(detail.select('p.mt10 a span')) != 0:
69 | estate = detail.select('p.mt10 a span')[0].text
70 | area = detail.select('div.area p')[0].text.replace('㎡', '')
71 | layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
72 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
73 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
74 | item = dict()
75 | item['avg_price'] = avg_price
76 | item['estate'] = estate
77 | item['area'] = area
78 | item['layout'] = layout
79 | item['total_price'] = total_price
80 | item['crawl_date'] = crawl_date
81 |
82 | item['province'] = response.request.meta['province']
83 | item['city'] = response.request.meta['city']
84 | item['district'] = response.request.meta['district']
85 | item['url'] = response.request.url
86 | yield item
87 |
88 | next_page = soup.select('a#PageControl1_hlk_next')
89 | if len(next_page) > 0:
90 | url = response.nice_join(next_page[0]['href'])
91 | request = Request(url=url, priority=2, callback=self.process_page_2)
92 | request.meta['province'] = response.request.meta['province']
93 | request.meta['city'] = response.request.meta['city']
94 | request.meta['district'] = response.request.meta['district']
95 | yield request
96 |
97 |
98 | if __name__ == '__main__':
99 | spider = SpiderCore(Fang_Processor(), test=True).set_pipeline(ConsolePipeline()).start()
100 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fang_shop_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 | from sasila.system_normal.pipeline.text_pipeline import TextPipelineFangShop
9 |
10 | from sasila.system_normal.processor.base_processor import BaseProcessor
11 | from sasila.system_normal.downloader.http.spider_request import Request
12 | import time
13 | from sasila.system_normal.utils.decorator import checkResponse
14 | from sasila.system_normal.utils import logger
15 |
16 | if sys.version_info < (3, 0):
17 | reload(sys)
18 | sys.setdefaultencoding('utf-8')
19 |
20 |
21 | class Fang_Shop_Processor(BaseProcessor):
22 | spider_id = 'fang_shop_spider'
23 | spider_name = 'fang_shop_spider'
24 | allowed_domains = ['fang.com']
25 | start_requests = [Request(url='http://shop.fang.com', priority=0)]
26 |
27 | @checkResponse
28 | def process(self, response):
29 | city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'}
30 | soup = bs('''北京
31 | 上海
32 | 广州
33 | 深圳
34 | 天津
35 | 重庆
36 | 成都
37 | 苏州
38 | 武汉
39 | 西安
40 | 东莞
41 | 昆明
42 | 杭州
43 | 济南
44 | 无锡
45 | 郑州
46 | 南昌
47 | 青岛
48 | 石家庄
49 | 南京
50 | 大连''', 'lxml')
51 | city__list = soup.select('a')
52 | for city in city__list:
53 | city_name = city.text
54 | if city_name in city_crawl_list:
55 | url = city['href']
56 | request = Request(url=url, priority=1, callback=self.process_page_1)
57 | request.meta['city'] = city_name
58 | yield request
59 |
60 | @checkResponse
61 | def process_page_1(self, response):
62 | soup = bs(response.m_response.content, 'lxml')
63 | district_list = soup.select('div.qxName a')
64 | district_list.pop(0)
65 | for district in district_list:
66 | district_name = district.text
67 | url = response.request.url + district['href']
68 | request = Request(url=url, priority=2, callback=self.process_page_2)
69 | request.meta['city'] = response.request.meta['city']
70 | request.meta['district'] = district_name
71 | yield request
72 |
73 | @checkResponse
74 | def process_page_2(self, response):
75 | soup = bs(response.m_response.content, 'lxml')
76 | detail_list = soup.select('div.houseList dl')
77 | for detail in detail_list:
78 | estate = detail.select('p.mt15 span.spName')[0].text
79 | detail_str = detail.select('p.mt10')[0].text
80 |
81 | temp_list = detail.select('p.mt10')[0].text.split('/')
82 | temp_list = [temp.strip() for temp in temp_list]
83 |
84 | if '购物中心/百货' not in detail_str and '层' in detail_str:
85 | m_type = temp_list[0].replace('类型:', '')
86 | floor = temp_list[1]
87 | total_floor = temp_list[2].replace('层', '')
88 | elif '购物中心/百货' not in detail_str and '层' not in detail_str:
89 | m_type = temp_list[0].strip().replace('类型:', '')
90 | floor = '未知'
91 | total_floor = '未知'
92 | elif '购物中心/百货' in detail_str and '层' not in detail_str:
93 | m_type = temp_list[0].replace('类型:', '') + temp_list[1]
94 | floor = '未知'
95 | total_floor = '未知'
96 | elif '购物中心/百货' in detail_str and '层' in detail_str:
97 | m_type = temp_list[0].replace('类型:', '') + temp_list[1]
98 | floor = temp_list[2]
99 | total_floor = temp_list[3].replace('层', '')
100 | else:
101 | logger.error('unexpected detail_str: ' + detail_str.strip())
102 |
103 | area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '')
104 | total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
105 | crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
106 |
107 | item = dict()
108 | item['estate'] = estate
109 | item['floor'] = floor
110 | item['total_floor'] = total_floor
111 | item['type'] = m_type
112 | item['area'] = area
113 | item['total_price'] = total_price
114 | item['crawl_date'] = crawl_date
115 |
116 | item['city'] = response.request.meta['city']
117 | item['district'] = response.request.meta['district']
118 | item['url'] = response.request.url
119 | yield item
120 |
121 | next_page = soup.select('a#PageControl1_hlk_next')
122 | if len(next_page) > 0:
123 | url = response.nice_join(next_page[0]['href']) + '/'
124 | request = Request(url=url, priority=2, callback=self.process_page_2)
125 | request.meta['city'] = response.request.meta['city']
126 | request.meta['district'] = response.request.meta['district']
127 | yield request
128 |
129 |
130 | # if __name__ == '__main__':
131 | # spider = SpiderCore(Fang_Shop_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(
132 | # TextPipelineFangShop()).start()
133 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/fe_loan_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
7 |
8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | from bs4 import BeautifulSoup as bs
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class FeProcessor(BaseProcessor):
18 | spider_id = 'fe'
19 | spider_name = 'fe'
20 | allowed_domains = ['58.com']
21 | start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]
22 |
23 | rules = (
24 | Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0),
25 | Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
26 | Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
27 | )
28 |
29 | def save(self, response):
30 | if response.m_response:
31 | print(bs(response.m_response.content, 'lxml').title.string)
32 |
33 |
34 | # fe_spider = SpiderCore(FeProcessor()).set_pipeline(PicPipeline())
35 | # if __name__ == '__main__':
36 | # fe_spider.start()
37 |
--------------------------------------------------------------------------------
/sasila/system_normal/processor/first_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class FirstProcessor(BaseProcessor):
18 | spider_id = 'test'
19 | spider_name = 'test'
20 | allowed_domains = ['mzitu.com']
21 | start_requests = [Request(url="http://www.mzitu.com/")]
22 |
23 | def process(self, response):
24 |         if not response.m_response:
25 |             return
26 |         soup = bs(response.m_response.content, 'lxml')
27 |         for a in soup.select("a"):
28 |             if "href" in a.attrs:
29 |                 yield {'url': response.nice_join(a["href"])}
30 |
31 | # if __name__ == '__main__':
32 | # spider = SpiderCore(FirstProcessor()).set_pipeline(ConsolePipeline()).start()
33 |
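34 | # Editor's sketch, not part of the original file: a hypothetical subclass whose
35 | # process() de-duplicates the joined URLs before yielding, so repeated
36 | # navigation links on the same page only produce one item each.
37 | # class DedupFirstProcessor(FirstProcessor):
38 | #     def process(self, response):
39 | #         if not response.m_response:
40 | #             return
41 | #         seen = set()
42 | #         soup = bs(response.m_response.content, 'lxml')
43 | #         for a in soup.select("a"):
44 | #             if "href" in a.attrs:
45 | #                 url = response.nice_join(a["href"])
46 | #                 if url not in seen:
47 | #                     seen.add(url)
48 | #                     yield {'url': url}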
--------------------------------------------------------------------------------
/sasila/system_normal/processor/mzitu_proccessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from bs4 import BeautifulSoup as bs
6 | from sasila.system_normal.spider.spider_core import SpiderCore
7 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
8 |
9 | from sasila.system_normal.processor.base_processor import BaseProcessor
10 | from sasila.system_normal.downloader.http.spider_request import Request
11 |
12 | if sys.version_info < (3, 0):
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 |
17 | class MezituProcessor(BaseProcessor):
18 | spider_id = 'mzitu'
19 | spider_name = 'mzitu'
20 | allowed_domains = ['mzitu.com', 'meizitu.net']
21 | start_requests = [Request(url='http://www.mzitu.com/xinggan')]
22 |
23 | def process(self, response):
24 | if response.m_response:
25 | soup = bs(response.m_response.content, "lxml")
26 | total_page = int(soup.select_one("a.next.page-numbers").find_previous_sibling().text)
27 | for page in range(1, total_page + 1):
28 | yield Request(url="http://www.mzitu.com/xinggan/page/" + str(page), callback=self.get_page_content)
29 |
30 | def get_page_content(self, response):
31 | if response.m_response:
32 | soup = bs(response.m_response.content, 'lxml')
33 | li_list = soup.select("div.postlist ul#pins li")
34 | for li in li_list:
35 | yield Request(url=li.select_one("a").attrs["href"], callback=self.get_pic, priority=1)
36 |
37 | def get_pic(self, response):
38 | if response.m_response:
39 | li_soup = bs(response.m_response.content, "lxml")
40 |             next_link = li_soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text)
41 |             if next_link is not None:
42 |                 total_page = int(next_link.find_previous_sibling().text)
43 | for page in range(1, total_page + 1):
44 | yield Request(url=response.request.url + "/" + str(page), callback=self.download_pic, priority=2)
45 |
46 | def download_pic(self, response):
47 | if response.m_response:
48 | href = bs(response.m_response.content, "lxml").select_one("div.main-image img").attrs["src"]
49 | yield Request(url=href, callback=self.download, priority=3)
50 |
51 | def download(self, response):
52 | if response.m_response:
53 | if response.m_response.status_code == 200:
54 | yield response.m_response.content
55 |
56 |
57 | # mzitu_spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline())
58 | #
59 | # if __name__ == '__main__':
60 | # spider = SpiderCore(MezituProcessor()).set_pipeline(PicPipeline()).start()
61 |
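62 | # Editor's note, not part of the original file: the crawl above is a chained set
63 | # of callbacks -- process() reads the page count of the /xinggan listing and
64 | # requests every listing page, get_page_content() yields one request per gallery,
65 | # get_pic() expands a gallery into its per-image pages, download_pic() extracts
66 | # the image URL, and download() yields the raw bytes (presumably for PicPipeline
67 | # to write to disk). The rising priorities (default -> 1 -> 2 -> 3) appear to be
68 | # there so requests near an actual image are scheduled before new listing pages,
69 | # assuming higher values are dequeued first (see scheduler/queue.py).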
--------------------------------------------------------------------------------
/sasila/system_normal/processor/mzitu_proccessor_regex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
7 |
8 | from sasila.system_normal.processor.base_processor import BaseProcessor, Rule, LinkExtractor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | import os
11 | import uuid
12 |
13 | if sys.version_info < (3, 0):
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 |
18 | class MezituProcessor(BaseProcessor):
19 | spider_id = 'mzitu'
20 | spider_name = 'mzitu'
21 | allowed_domains = ['mzitu.com', 'meizitu.net']
22 | start_requests = [Request(url='http://www.mzitu.com/xinggan/')]
23 |
24 | rules = (
25 | Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
26 | callback="save", priority=3),
27 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
28 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
29 | Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
30 | )
31 |
32 | def save(self, response):
33 | if response.m_response:
34 | if not os.path.exists("img"):
35 | os.mkdir("img")
36 | with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
37 | fs.write(response.m_response.content)
38 | print("download success!")
39 |
40 |
41 | # if __name__ == '__main__':
42 | # spider = SpiderCore(MezituProcessor(), batch_size=10).set_pipeline(PicPipeline()).start()
43 |
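44 | # Editor's sketch, not part of the original file: a hypothetical subclass whose
45 | # save() keeps the remote file name instead of a random uuid; it assumes
46 | # response.request.url holds the image URL matched by the first rule above.
47 | # class NamedMezituProcessor(MezituProcessor):
48 | #     def save(self, response):
49 | #         if response.m_response:
50 | #             if not os.path.exists("img"):
51 | #                 os.mkdir("img")
52 | #             name = os.path.basename(response.request.url)
53 | #             with open(os.path.join("img", name), 'wb') as fs:
54 | #                 fs.write(response.m_response.content)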
--------------------------------------------------------------------------------
/sasila/system_normal/processor/qcc_processor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | from sasila.system_normal.spider.spider_core import SpiderCore
6 | from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
7 | from sasila.system_normal.pipeline.text_pipeline import TextPipeline
8 | from sasila.system_normal.processor.base_processor import BaseProcessor
9 | from sasila.system_normal.downloader.http.spider_request import Request
10 | from bs4 import BeautifulSoup as bs
11 | import time
12 | from sasila.system_normal.utils import logger
13 |
14 | import traceback
15 |
16 | if sys.version_info < (3, 0):
17 | reload(sys)
18 | sys.setdefaultencoding('utf-8')
19 |
20 |
21 | class QccProcessor(BaseProcessor):
22 | spider_id = 'qcc'
23 | spider_name = 'qcc'
24 | allowed_domains = ['qichacha.com']
25 |
26 | start_requests = [
27 | Request(url='http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE')
28 | ]
29 |
30 | def process(self, response):
31 | if not response.m_response:
32 | logger.error(response.request.url)
33 | yield response.request
34 | if '