├── .gitignore ├── LICENSE ├── README.md ├── cooperator ├── CoperProxy.py ├── config.py └── start.py ├── master ├── MasterMain.py ├── basic_func.py ├── config.py └── start.py └── worker ├── WorkerMain.py ├── basic_func.py ├── config.py ├── db.py ├── filters.py └── start.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ugly-Distributed-Crawler 2 | ## 简陋的分布式爬虫 3 | 新手向,基于Redis构建的分布式爬虫。 4 | 以爬取考研网的贴子为例,利用 PyQuery, lxml 进行解析,将符合要求的文章文本存入MySQ数据库中。 5 | ## 结构简介 6 | #### cooperator 7 | 协作模块,用于为Master&Worker模块提供代理IP支持 8 | #### master 9 | 提取满足条件的文章url,并交给Worker进一步处理 10 | #### Worker 11 | 解析文章内容,将符合要求的存入数据库 12 | 13 | ## 环境依赖 ## 14 | sqlalchemy => 1.0.13 15 | pyquery => 1.2.17 16 | requests => 2.12.3 17 | redis => 2.10.5 18 | lxml => 3.6.0 19 | > 1. 需要预先安装MySQL-server 和 Redis-server. 20 | > 2. MySQL中应有名为kybsrc的数据库,且该数据库包含一个名为posts的表,拥有num(INT AUTO_INCREMENT)和post(TEXT)两个字段。 21 | 22 | ## 如何启动 23 | 24 | #### 0. 先配置好各模块所引用的配置文件 25 | 26 | #### 1. 为了更好地运行,cooperator/start.py 应提前开始并完成一次工作函数执行 27 | > 第一次执行完后,每五分钟运行一次工作函数 28 | 29 | #### 2. 启动 master/start.py 30 | > 默认只执行一次 31 | 32 | #### 3. 启动 worker/start.py 33 | > 默认循环监听是否有新的URL待解析 34 | 35 | ## 核心点说明 36 | #### 1. 通过Redis的集合类型进行代理IP和URL的传递 37 | 38 | ```python 39 | # Summary Reference 40 | # --------- 41 | # 创建句柄 42 | def make_redis_handler(): 43 | pool = redis.ConnectionPool(host=r_server['ip'], port=r_server['port'], password=r_server['passwd']) 44 | return redis.Redis(connection_pool=pool) 45 | 46 | # 获得句柄 47 | def make_proxy_handler(): 48 | return make_redis_handler() 49 | 50 | # 保存到指定的set下 51 | def check_and_save(self, proxy): 52 | 'pass' 53 | self.redis_handler.sadd(r_server['s_name'], proxy) 54 | ``` 55 | #### 2. 由于在验证代理IP和使用封装的get_url()函数的时候网络IO较多,所以使用多线程(效果还是很明显的)。 56 | 57 | ```python 58 | #Summary Reference 59 | #--------- 60 | def save_proxy_ip(self): 61 | 'pass' 62 | for proxy in self.proxy_ip: 63 | Thread(target=self.check_and_save, args=(proxy,)).start() 64 | 65 | def get_url(url): 66 | 'pass' 67 | while True: 68 | 'pass' 69 | resp = request('get', url, headers=headers, proxies={'http': proxy}) 70 | 'pass' 71 | ``` 72 | 73 | ## 项目地址 74 | #### https://github.com/A1014280203/Ugly-Distributed-Crawler 75 | -------------------------------------------------------------------------------- /cooperator/CoperProxy.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery 2 | import requests 3 | from requests.exceptions import ProxyError 4 | import redis 5 | from threading import Thread 6 | from config import r_server 7 | 8 | 9 | class ProxyDemo(object): 10 | 11 | def __init__(self): 12 | print("begin") 13 | print('create instance start') 14 | self.query_address = 'http://www.ip.cn/' 15 | self.proxy_address = ['http://www.xicidaili.com/nn/', 'http://www.xicidaili.com/nt/'] 16 | self.primary_ip = '' 17 | self.proxy_ip = list() 18 | self.headers = { 19 | 'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87', 20 | 'Referer': '' 21 | } 22 | self.redis_handler = self.connect_redis_server() 23 | print("create instance stop") 24 | 25 | def connect_redis_server(self): 26 | pool = redis.ConnectionPool(host=r_server['ip'], port=r_server['port'], password=['passwd']) 27 | return redis.Redis(connection_pool=pool) 28 | 29 | def set_primary_ip(self): 30 | print('set_primary_ip start') 31 | self.headers['Referer'] = self.query_address 32 | html = PyQuery(self.query_address, encoding='utf8', headers=self.headers) 33 | result = html('code').eq(0).text() 34 | self.primary_ip = result 35 | print('set_primary_ip stop') 36 | 37 | def get_proxy_ip(self): 38 | print('get_proxy_ip start') 39 | for address in self.proxy_address: 40 | self.headers['Referer'] = address 41 | html = PyQuery(address, encoding='utf8', headers=self.headers) 42 | result = html('tr') 43 | del result[0] 44 | for i in result.items(): 45 | self.proxy_ip.append( 46 | i('td').eq(1).text() + ":" + i('td').eq(2).text()) 47 | print('get_proxy_ip stop') 48 | 49 | def check_and_save(self, proxy): 50 | self.headers['Referer'] = self.query_address 51 | try: 52 | resp = requests.get(self.query_address, proxies={'http': proxy}, headers=self.headers) 53 | html = PyQuery(resp.content.decode()) 54 | except ProxyError: 55 | print('Expired:', proxy) 56 | return 57 | except UnicodeDecodeError: 58 | return 59 | result = html('code').eq(0).text() 60 | if result != self.primary_ip: 61 | self.redis_handler.sadd(r_server['s_name'], proxy) 62 | 63 | def save_proxy_ip(self): 64 | print('check_and_save start') 65 | self.redis_handler.delete(r_server['s_name']) 66 | for proxy in self.proxy_ip: 67 | Thread(target=self.check_and_save, args=(proxy,)).start() 68 | print('check_and_save stop') 69 | 70 | def start(self): 71 | self.set_primary_ip() 72 | self.get_proxy_ip() 73 | self.save_proxy_ip() 74 | print("end") 75 | 76 | if __name__ == "__main__": 77 | ProxyDemo().start() 78 | -------------------------------------------------------------------------------- /cooperator/config.py: -------------------------------------------------------------------------------- 1 | # settings for connecting redis-server 2 | r_server = { 3 | 'ip': "1.2.3.4", 4 | 'port': '6379', 5 | 'passwd': '', 6 | # the name of set which stores proxies 7 | 's_name': 'proxy_ip' 8 | } 9 | 10 | # for time.sleep() 11 | settings = { 12 | 'interval': 5*60 13 | } 14 | -------------------------------------------------------------------------------- /cooperator/start.py: -------------------------------------------------------------------------------- 1 | from CoperProxy import ProxyDemo 2 | import time 3 | from config import settings 4 | 5 | # 首先配置好config文件 6 | while True: 7 | ProxyDemo().start() 8 | time.sleep(settings['interval']) 9 | -------------------------------------------------------------------------------- /master/MasterMain.py: -------------------------------------------------------------------------------- 1 | from .basic_func import get_url, make_redis_handler 2 | from lxml import etree 3 | from .config import settings, r_server 4 | from multiprocessing import Pool 5 | 6 | 7 | class MasterDemo(object): 8 | 9 | def __init__(self): 10 | self.index_url = 'http://bbs.kaoyan.com/' 11 | self.raw_block_url = dict() 12 | 13 | def get_block_url(self): 14 | print('get_block_url') 15 | resp = get_url(self.index_url) 16 | html = etree.HTML(resp.content.decode()) 17 | a_tags = html.xpath('//*[@id="category_173"]/table/tr[1]/td[position()<5]//a') 18 | for a in a_tags: 19 | self.raw_block_url[a.text] = a.attrib['href'][:-1] 20 | 21 | def save_url(self, values, uhandler): 22 | print('save_url') 23 | uhandler.sadd(r_server['s_url'], *values) 24 | 25 | def delivery_post_url_by_block(self, raw_url): 26 | print('delivery_post_url_by_block') 27 | uhandler = make_redis_handler() 28 | for i in range(0, settings['b_pages']): 29 | made_url = raw_url + str(i+1) 30 | resp = get_url(made_url) 31 | html = etree.HTML(resp.content.decode()) 32 | # 只记录符合阅读大于read,回复大于reply的 33 | raw_path = "//td[@class='num']/em[text()>{read_num}]/../a[text()>{reply_num}]/@href" 34 | made_path = raw_path.format(read_num=settings['read'], reply_num=settings['reply']) 35 | filtered_url_list = html.xpath(made_path) 36 | if len(filtered_url_list) < 1: 37 | print('Error: No data.@:' + made_url) 38 | return 39 | self.save_url(filtered_url_list, uhandler) 40 | 41 | def delivery_post_url(self): 42 | print('delivery_post_url') 43 | # 使用多进程启动 44 | p = Pool(5) 45 | for url in self.raw_block_url.values(): 46 | p.apply_async(self.delivery_post_url_by_block, args=(url,)) 47 | p.close() 48 | p.join() 49 | 50 | def start(self): 51 | make_redis_handler().delete(r_server['s_url']) 52 | self.get_block_url() 53 | self.delivery_post_url() 54 | 55 | if __name__ == "__main__": 56 | MasterDemo().start() 57 | -------------------------------------------------------------------------------- /master/basic_func.py: -------------------------------------------------------------------------------- 1 | from requests.api import request 2 | from requests.exceptions import ProxyError 3 | import redis 4 | from .config import headers, settings, r_server 5 | 6 | 7 | def make_redis_handler(): 8 | pool = redis.ConnectionPool(host=r_server['ip'], port=r_server['port'], 9 | password=r_server['passwd']) 10 | return redis.Redis(connection_pool=pool) 11 | 12 | 13 | def make_proxy_handler(): 14 | return make_redis_handler() 15 | 16 | 17 | def get_proxy(): 18 | phandler = make_proxy_handler() 19 | return phandler.srandmember(r_server['s_proxy'], 1)[0].decode() 20 | 21 | 22 | def get_url(url): 23 | headers['Referer'] = url 24 | count = 0 25 | while True: 26 | count += 1 27 | if count < settings['maxtries']: 28 | proxy = get_proxy() 29 | else: 30 | proxy = None 31 | try: 32 | resp = request('get', url, headers=headers, proxies={'http': proxy}) 33 | return resp 34 | except ProxyError: 35 | if count > settings['maxtries']+2: 36 | print('Exit: Can not get url.<@get_url>') 37 | exit(1) 38 | continue 39 | 40 | if __name__ == '__main__': 41 | get_url('http://bbs.kaoyan.com') -------------------------------------------------------------------------------- /master/config.py: -------------------------------------------------------------------------------- 1 | headers = { 2 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/' 3 | '537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 4 | 'Referer': '' 5 | } 6 | 7 | r_server = { 8 | 'ip': 'localhost', 9 | 'port': '6379', 10 | 'passwd': '', 11 | 's_proxy': 'proxy_ip', 12 | # the name of set which stores url of posts 13 | 's_url': 'url' 14 | } 15 | 16 | settings = { 17 | # 使用代理时最大尝试次数 18 | 'maxtries': 3, 19 | # 每个版块遍历的页数 20 | 'b_pages': 5, 21 | # 合格的回复下限 22 | 'reply': 45, 23 | # 合格的阅读下限 24 | 'read': 10000, 25 | } 26 | -------------------------------------------------------------------------------- /master/start.py: -------------------------------------------------------------------------------- 1 | from .MasterMain import MasterDemo 2 | 3 | # 首先配置好config文件 4 | MasterDemo().start() 5 | -------------------------------------------------------------------------------- /worker/WorkerMain.py: -------------------------------------------------------------------------------- 1 | from .basic_func import make_redis_handler, get_url, pop_url 2 | from .filters import is_poll, is_locked, text_length_is_short, \ 3 | need_to_login, auth_reply_over_max, clear_text 4 | from pyquery import PyQuery 5 | from lxml import etree 6 | from .db import db_add 7 | import threading 8 | import time 9 | 10 | 11 | class WorkerDemo(object): 12 | 13 | def __init__(self): 14 | self.uhandler = make_redis_handler() 15 | 16 | def check_post(self, html): 17 | if is_locked(html): 18 | return False 19 | if is_poll(html): 20 | return False 21 | if text_length_is_short(html): 22 | return False 23 | if need_to_login(html): 24 | return False 25 | return auth_reply_over_max(html) 26 | 27 | def get_text_of_post(self, url): 28 | resp = get_url(url) 29 | flag = self.check_post(etree.HTML(resp.content.decode())) 30 | if flag: 31 | doc = PyQuery(resp.content.decode()) 32 | td_tags = doc('td').filter('.t_f') 33 | del td_tags[flag:] 34 | text = '' 35 | for td in td_tags.items(): 36 | text += clear_text(td.text()) 37 | return text 38 | 39 | def save_single_post(self, url): 40 | text = self.get_text_of_post(url) 41 | db_add(text) 42 | 43 | def save_all_posts(self): 44 | while True: 45 | url = pop_url() 46 | if not url: 47 | time.sleep(3) 48 | continue 49 | threading.Thread(target=self.save_single_post, args=(url,)).start() 50 | 51 | def start(self): 52 | self.save_all_posts() 53 | 54 | if __name__ == '__main__': 55 | WorkerDemo().get_text_of_post('http://bbs.kaoyan.com/t7729132p1') -------------------------------------------------------------------------------- /worker/basic_func.py: -------------------------------------------------------------------------------- 1 | from requests.api import request 2 | from requests.exceptions import ProxyError 3 | import redis 4 | from .config import headers, settings, r_server 5 | 6 | 7 | def make_redis_handler(): 8 | pool = redis.ConnectionPool(host=r_server['ip'], port=r_server['port'], 9 | password=r_server['passwd']) 10 | return redis.Redis(connection_pool=pool) 11 | 12 | 13 | def make_proxy_handler(): 14 | return make_redis_handler() 15 | 16 | 17 | def get_proxy(): 18 | phandler = make_proxy_handler() 19 | return phandler.srandmember('proxy_ip', 1)[0].decode() 20 | 21 | 22 | def pop_url(): 23 | uhandler = make_redis_handler() 24 | return uhandler.spop('url').decode() 25 | 26 | 27 | def get_url(url): 28 | headers['Referer'] = url 29 | count = 0 30 | while True: 31 | count += 1 32 | if count < settings['maxtries']: 33 | proxy = get_proxy() 34 | else: 35 | proxy = None 36 | try: 37 | resp = request('get', url, headers=headers, proxies={'http': proxy}) 38 | return resp 39 | except ProxyError: 40 | if count > settings['maxtries']+2: 41 | print('Exit: Could not get url.<@get_url>') 42 | exit(1) 43 | continue 44 | 45 | if __name__ == '__main__': 46 | get_url('http://bbs.kaoyan.com') -------------------------------------------------------------------------------- /worker/config.py: -------------------------------------------------------------------------------- 1 | headers = { 2 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/' 3 | '537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 4 | 'Referer': '' 5 | } 6 | 7 | r_server = { 8 | 'ip': '1.2.3.4', 9 | 'port': '6379', 10 | 'passwd': '', 11 | 's_proxy': 'proxy_ip', 12 | 's_url': 'url' 13 | } 14 | 15 | # settings for connecting database 16 | d_server = { 17 | 'user': 'user', 18 | 'passwd': 'password', 19 | 'addr': 'localhost' 20 | } 21 | 22 | settings = { 23 | 'p_pages': 4 24 | } 25 | 26 | filters = { 27 | # 文本最少字符数 28 | 'txt-len': 100, 29 | # 允许楼主在第一页最大回复数 re-max/10 30 | 're-max': 5 31 | } 32 | -------------------------------------------------------------------------------- /worker/db.py: -------------------------------------------------------------------------------- 1 | from config import d_server 2 | from sqlalchemy import create_engine 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy import Column, Integer, Text 5 | from sqlalchemy.ext.declarative import declarative_base 6 | Base = declarative_base() 7 | 8 | 9 | class Post(Base): 10 | __tablename__ = 'posts' 11 | num = Column(Integer, primary_key=True, autoincrement=True) 12 | post = Column(Text) 13 | 14 | 15 | def make_db_session(): 16 | param = 'mysql+pymysql://{user}:{pw}@{addr}/kybsrc?charset=utf8'.format(user=d_server['user'], 17 | pw=d_server['passwd'], 18 | addr=d_server['addr']) 19 | engine = create_engine(param) 20 | db_session = sessionmaker(bind=engine) 21 | session = db_session() 22 | return session 23 | 24 | 25 | def db_add(text): 26 | db = make_db_session() 27 | db.add(Post(post=text)) 28 | db.commit() 29 | db.close() 30 | 31 | if __name__ == '__main__': 32 | t = 'content of post' 33 | db_add(t) 34 | -------------------------------------------------------------------------------- /worker/filters.py: -------------------------------------------------------------------------------- 1 | from config import filters 2 | from pyquery import PyQuery 3 | import re 4 | 5 | 6 | def is_locked(html): 7 | # 判断楼主是否被禁言或者需要回复查看 8 | doc = PyQuery(html) 9 | if doc('div').filter('.locked').length: 10 | return True 11 | return False 12 | 13 | 14 | def is_poll(html): 15 | # 判断是不是投票贴 16 | doc = PyQuery(html) 17 | if doc('form').filter('#poll').length: 18 | return True 19 | return False 20 | 21 | 22 | def need_to_login(html): 23 | # 是否需要登陆后查看 24 | result = html.xpath("//div/h3") 25 | if len(result): 26 | return True 27 | return False 28 | 29 | 30 | def text_length_is_short(html): 31 | # 判断首楼内容是不是过短 32 | result = html.xpath("//div[@id='postlist']/div[1]//td[@class='t_f']//text()") 33 | if len(''.join(result)) < filters['txt-len']: 34 | return True 35 | return False 36 | 37 | 38 | def auth_reply_over_max(html): 39 | # 判断楼主在第一页中是否回复太多 40 | raw_auth_href_list = html.xpath("//div[@class='pi']/div[@class='authi']/a/@href") 41 | try: 42 | basic_href = raw_auth_href_list[0] 43 | except Exception: 44 | return None 45 | count = 0 46 | count_max = filters['re-max'] 47 | for href in raw_auth_href_list: 48 | if basic_href == href: 49 | count += 1 50 | if count > count_max: 51 | return None 52 | if basic_href == raw_auth_href_list[1]: 53 | return 2 54 | else: 55 | return 1 56 | 57 | 58 | def clear_text(text=''): 59 | t_text = text.replace(r'\xa0', ' ') 60 | t_text = t_text.replace('下载附件', '') 61 | t_text = re.subn('\(.* Bytes, 下载次数: .*\)', '', t_text)[0] 62 | t_text = re.subn('\d*-\d*-\d* \d*:\d* 上传', '', t_text)[0] 63 | t_text = re.subn('.*\.png\s', '', t_text)[0] 64 | return t_text.strip() 65 | -------------------------------------------------------------------------------- /worker/start.py: -------------------------------------------------------------------------------- 1 | from .WorkerMain import WorkerDemo 2 | 3 | 4 | WorkerDemo().start() 5 | 6 | 7 | --------------------------------------------------------------------------------