├── .gitignore ├── LICENSE ├── README.md └── zhihu ├── scrapy.cfg └── zhihu ├── __init__.py ├── __pycache__ ├── __init__.cpython-35.pyc ├── cookie.cpython-35.pyc ├── items.cpython-35.pyc ├── middlewares.cpython-35.pyc ├── pipelines.cpython-35.pyc ├── proxy.cpython-35.pyc ├── settings.cpython-35.pyc ├── user_agents_pc.cpython-35.pyc └── yumdama.cpython-35.pyc ├── cookie.py ├── items.py ├── middlewares.py ├── pipelines.py ├── proxy.py ├── scrapy_redis ├── BloomfilterOnRedis.py ├── __init__.py ├── connection.py ├── defaults.py ├── dupefilter.py ├── picklecompat.py ├── pipelines.py ├── queue.py ├── scheduler.py ├── spiders.py ├── tests.py └── utils.py ├── settings.py ├── spiders ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── zhihuspider.cpython-35.pyc │ ├── zhihuspider0.cpython-35.pyc │ └── zhihuspider1.cpython-35.pyc ├── zhihuspider.py └── zhihuspider0.py ├── user_agents_pc.py └── yumdama.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 AlexTan-b-z 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zhihu spider (with Scrapy's default settings, a single machine can crawl over 600,000 records per hour) 2 | *** 3 | *Version*: 2.0 4 | *Author*: AlexTan 5 | 6 | *CSDN*: [AlexTan_](http://blog.csdn.net/alextan_) 7 | *E-Mail* : 8 | *** 9 | 10 | ## Original blog post: [ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068) 11 | 12 | 13 | 14 | ## Changelog: 15 | * 2017.12.18: v2.0. Modified the spider to fix the problem where, after the crawler has been running for a long time and the pending-requests queue in Redis gets drained for some special reason, restarting the crawler leaves every request from start_requests filtered out by the dupefilter. 16 | 17 | * 2017.11.21: v2.0. Optimized proxy.py so that no IP's weight can exceed 10, avoiding the situation where some IPs' weights grow without bound and a dead IP then takes a very long time to be removed. 18 | 19 | * 2017.10.08: v2.0. Optimized the proxy-IP pool middleware (the Zhihu spider itself does not need it; the middleware can be ported to other crawlers, so ignore it if you only care about the Zhihu spider). Because the previous provider's proxy IPs expired, this version uses Xun Proxy, which works much better, with a success rate of roughly 95%. The downside is that the premium tier only returns 20 IPs per request and at most 1000 per day. The old IP-rotation code mistakenly deleted many IPs that were still valid, so this version gives each IP a weight (status): the default weight is 10, each failed request subtracts one, each successful request adds one, and when the weight drops below 1 the IP is deleted (a sketch of this scheme appears at the end of this README). 20 | 21 | * 2017.08.22: Updated the pipeline and spider files in all three versions. Previously, when a RelationItem was inserted into MongoDB, the "next"-page data could be inserted at random into either the followers or the followees list, corrupting the data; this is now fixed. Also, several people reported that fetching proxy IPs would block the crawler when the proxy pool was enabled, so proxy fetching now runs in a separate thread, which removes the blocking. 22 | 23 | * 2017.08.17: v2.0. Optimized scrapy_redis by changing its deduplication mechanism (added a Bloom filter). Reason for the update: v1.0 would fill the memory of a 16 GB server within two or three days; after the update, v2.0 only uses about 2 to 3 GB after 3 days of running, and it barely grows (see the memory estimate at the end of this README). 24 | 25 | 26 | ## About Redis: 27 | For long-running crawls, it is recommended to adjust redis.conf, which on Ubuntu is at `/etc/redis/redis.conf` by default (an example snippet is given at the end of this README): 28 | 1. Set maxmemory to 3/4 of your RAM 29 | 2. Set maxmemory-policy to allkeys-lru 30 | 31 | ### Finally, it is recommended to run with several accounts; roughly 7 or 8 should be enough. 32 | 33 | 34 | ## Original blog post: [ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068) 35 | 36 | 37 | *** 38 | 39 | Finally, questions and suggestions are welcome. Let's learn together!
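The Bloom-filter change in the 2017.08.17 entry is what keeps memory flat: request fingerprints go into a fixed-size bitmap instead of an ever-growing Redis set. Below is a rough estimate of what that costs, assuming the single 256 MB block and 5 hash seeds used in `BloomfilterOnRedis.py`; the fingerprint count `n` is an illustrative assumption, not a measured figure.

```python
from math import exp

m = 1 << 31          # bits in one Redis string block (256 MB), as in BloomfilterOnRedis.py
k = 5                # number of hash seeds the filter uses
n = 100_000_000      # assumed number of request fingerprints seen so far (illustrative)

false_positive = (1 - exp(-k * n / m)) ** k
print("Redis usage: %.0f MB, false-positive rate: %.4f%%" % (m / 8 / 2**20, false_positive * 100))
# roughly 256 MB of Redis and a false-positive rate around 0.04%
```

The simple multiplicative hashes in `SimpleHash` are weaker than the ideal hashes this formula assumes, so treat the result as an optimistic lower bound on the real false-positive rate.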
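The weighting rule from the 2017.10.08 and 2017.11.21 entries, shown as a minimal sketch. This is an illustration only, not the project's actual `updateIPPOOLS` in `proxy.py`; the key layout `IP:<host>:<port>:<weight>` is inferred from how `middlewares.py` parses the keys, and the helper name is hypothetical.

```python
import redis

def adjust_weight(rconn: redis.Redis, host_port: str, old_weight: int, delta: int) -> None:
    """delta is +1 after a successful request, -1 after a failure (hypothetical helper)."""
    rconn.delete("IP:%s:%s" % (host_port, old_weight))   # drop the key holding the old weight
    new_weight = min(10, old_weight + delta)             # weights are capped at 10 (2017.11.21 change)
    if new_weight >= 1:                                  # keep the IP while its weight stays >= 1
        rconn.set("IP:%s:%s" % (host_port, new_weight), host_port)
    # when the weight falls below 1 the key is simply not re-created, i.e. the IP is dropped

# e.g. adjust_weight(rconn, "1.2.3.4:8080", 10, -1) after a failed request through that proxy
```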
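The two redis.conf settings recommended in the "About Redis" section, written out for a hypothetical 16 GB machine (scale maxmemory to about 3/4 of your own RAM):

```
# /etc/redis/redis.conf (default location on Ubuntu)
maxmemory 12gb
maxmemory-policy allkeys-lru
```

Restart the redis-server service after editing the file so the new limits take effect.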
40 | -------------------------------------------------------------------------------- /zhihu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihu 12 | -------------------------------------------------------------------------------- /zhihu/zhihu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__init__.py -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/cookie.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/cookie.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/items.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/middlewares.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/middlewares.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/pipelines.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/pipelines.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/proxy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/proxy.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/user_agents_pc.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/user_agents_pc.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/__pycache__/yumdama.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/__pycache__/yumdama.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/cookie.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import pdb 3 | import os 4 | import time 5 | import json 6 | from selenium import webdriver 7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 8 | import logging 9 | from .yumdama import identify 10 | 11 | # ------------------------------------------ 12 | # 版本:1.0 13 | # 日期:2017-8-06 14 | # 作者:AlexTan 15 | # 16 | # 17 | # ------------------------------------------ 18 | 19 | dcap = dict(DesiredCapabilities.PHANTOMJS) 20 | dcap["phantomjs.page.settings.userAgent"] = ( 21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36" 22 | ) 23 | logger = logging.getLogger(__name__) 24 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 25 | 26 | METHOD = 0 #0代表手动输入验证码,1代表云打码 27 | 28 | myZhiHu = [ 29 | ('account','password',0), #0代表账号为手机,1代表账号为邮箱 30 | ] 31 | 32 | def getCookie(account,password,way): 33 | if way == 0: 34 | loginURL = "https://www.zhihu.com/login/phone_num" 35 | username = 'phone_num' 36 | else: 37 | loginURL = "https://www.zhihu.com/login/email" 38 | username = 'email' 39 | try: 40 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 41 | #browser = webdriver.Firefox() 42 | browser.set_window_size(1920, 1080) 43 | browser.get("https://www.zhihu.com/explore") 44 | time.sleep(1) 45 | #pdb.set_trace() 46 | browser.find_element_by_class_name('switch-to-login').click() 47 | loginDIV = browser.find_element_by_id('SidebarSignFlow').find_element_by_class_name('LoginForm') 48 | loginDIV.find_element_by_name('account').send_keys(account) 49 | loginDIV.find_element_by_name('password').send_keys(password) 50 | time.sleep(1) 51 | while True: 52 | browser.save_screenshot("zhihu.png") 53 | if loginDIV.find_element_by_class_name('captcha-module').get_attribute('style') != '': 54 | if METHOD == 0: 55 | code_txt = input("请查看路径下新生成的zhihu.png,然后输入验证码:") 56 | else: 57 | img = loginDIV.find_element_by_class_name('captcha') 58 | x = img.location["x"] 59 | y = img.location["y"] 60 | from PIL import Image 61 | im = Image.open("zhihu.png") 62 | im.crop((x, y, 85 + x, y + 30)).save("captcha.png") 63 | #pdb.set_trace() 64 | code_txt = identify() 65 | loginDIV.find_element_by_name('captcha').send_keys(code_txt) 66 | loginDIV.find_element_by_class_name('zg-btn-blue').click() 67 | time.sleep(3) 68 | try: 69 | loginDIV.find_element_by_class_name('error') 70 | logger.warning("验证码或账号密码错误 %s!" % account) 71 | except: 72 | break 73 | try: 74 | #pdb.set_trace() 75 | browser.find_element_by_class_name('top-nav-profile') 76 | cookie = {} 77 | for elem in browser.get_cookies(): 78 | cookie[elem["name"]] = elem["value"] 79 | logger.warning("Get Cookie Success!( Account:%s )" % account) 80 | #pdb.set_trace() 81 | return json.dumps(cookie) 82 | except Exception: 83 | logger.warning("Failed %s!" 
% account) 84 | return "" 85 | except Exception: 86 | logger.warning("Failed %s!" % account) 87 | return "" 88 | finally: 89 | try: 90 | browser.quit() 91 | except Exception: 92 | pass 93 | 94 | def UpdateCookie(account,cookie): 95 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 96 | #browser = webdriver.Firefox() 97 | browser.set_window_size(1920, 1080) 98 | browser.get('https://www.zhihu.com') 99 | browser.delete_all_cookies() 100 | send_cookie = [] 101 | for key,value in cookie.items(): 102 | one = {} 103 | one = {'domain':'.zhihu.com','name':key,'value':value,'path':'/','expiry':None} 104 | #pdb.set_trace() 105 | browser.add_cookie({k: one[k] for k in ('name', 'value', 'domain', 'path', 'expiry')}) 106 | #one = {'domain':'.zhihu.com','name':key,'value':value} 107 | #send_cookie.append(one) 108 | #browser.add_cookie(send_cookie) 109 | browser.get('https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84') 110 | time.sleep(1) 111 | browser.save_screenshot("update.png") 112 | if METHOD == 0: 113 | code_txt = input("请查看路径下新生成的update.png,然后输入验证码:") 114 | else: 115 | img = browser.find_element_by_class_name('Unhuman-captcha') 116 | x = img.location["x"] 117 | y = img.location["y"] 118 | from PIL import Image 119 | im = Image.open("zhihu.png") 120 | im.crop((x, y, 85 + x, y + 30)).save("captcha.png") 121 | #pdb.set_trace() 122 | code_txt = identify() 123 | browser.find_element_by_class_name('Input').send_keys(code_txt) 124 | browser.find_element_by_class_name('Button--blue').click() 125 | time.sleep(3) 126 | try: 127 | browser.find_element_by_class_name('AppHeader-profile') 128 | cookie = {} 129 | for elem in browser.get_cookies(): 130 | cookie[elem["name"]] = elem["value"] 131 | logger.warning("Update Cookie Success!( Account:%s )" % account) 132 | #pdb.set_trace() 133 | return json.dumps(cookie) 134 | except Exception: 135 | logger.warning("Update Failed %s!" % account) 136 | return "" 137 | finally: 138 | try: 139 | browser.quit() 140 | except Exception: 141 | pass 142 | 143 | 144 | 145 | def initCookie(rconn, spiderName): 146 | """ 获取所有账号的Cookies,存入Redis。如果Redis已有该账号的Cookie,则不再获取。 """ 147 | for zhihu in myZhiHu: 148 | if rconn.get("%s:Cookies:%s--%s" % (spiderName, zhihu[0], zhihu[1])) is None: # 'zhihuspider:Cookies:账号--密码',为None即不存在。 149 | cookie = getCookie(zhihu[0], zhihu[1],zhihu[2]) 150 | if len(cookie) > 0: 151 | rconn.set("%s:Cookies:%s--%s" % (spiderName, zhihu[0], zhihu[1]), cookie) 152 | cookieNum = str(rconn.keys()).count("zhihuspider:Cookies") 153 | logger.warning("The num of the cookies is %s" % cookieNum) 154 | if cookieNum == 0: 155 | logger.warning('Stopping...') 156 | os.system("pause") 157 | 158 | def updateCookie(accountText, rconn, spiderName, cookie): 159 | """ 更新一个账号的Cookie """ 160 | account = accountText.split("--")[0] 161 | #pdb.set_trace() 162 | new_cookie = UpdateCookie(account, cookie) 163 | if len(new_cookie) > 0: 164 | logger.warning("The cookie of %s has been updated successfully!" % account) 165 | rconn.set("%s:Cookies:%s" % (spiderName, accountText), new_cookie) 166 | else: 167 | logger.warning("The cookie of %s updated failed! Remove it!" 
% accountText) 168 | removeCookie(accountText, rconn, spiderName) 169 | 170 | def removeCookie(accountText, rconn, spiderName): 171 | """ 删除某个账号的Cookie """ 172 | rconn.delete("%s:Cookies:%s" % (spiderName, accountText)) 173 | cookieNum = str(rconn.keys()).count("zhihuspider:Cookies") 174 | logger.warning("The num of the cookies left is %s" % cookieNum) 175 | if cookieNum == 0: 176 | logger.warning("Stopping...") 177 | os.system("pause") 178 | 179 | 180 | if __name__ == '__main__': 181 | getCookie(myZhiHu[0][0],myZhiHu[0][1],myZhiHu[0][2]) 182 | -------------------------------------------------------------------------------- /zhihu/zhihu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # ------------------------------------------ 4 | # 版本:1.0 5 | # 日期:2017-8-06 6 | # 作者:AlexTan 7 | # 8 | # 9 | # ------------------------------------------ 10 | 11 | import scrapy 12 | 13 | 14 | class ZhihuItem(scrapy.Item): 15 | # define the fields for your item here like: 16 | # name = scrapy.Field() 17 | user_id = scrapy.Field() 18 | user_image_url = scrapy.Field() 19 | name = scrapy.Field() 20 | locations = scrapy.Field() 21 | business = scrapy.Field() #所在行业 22 | employments = scrapy.Field() #职业经历 23 | gender = scrapy.Field() 24 | education = scrapy.Field() 25 | followees_num = scrapy.Field() #我关注的人数 26 | followers_num = scrapy.Field() #关注我的人数 27 | 28 | class RelationItem(scrapy.Item): 29 | user_id = scrapy.Field() 30 | relation_type = scrapy.Field() #关系类型 31 | relations_id = scrapy.Field() 32 | 33 | class AnswerItem(scrapy.Item): 34 | answer_user_id = scrapy.Field() 35 | answer_id = scrapy.Field() 36 | question_id = scrapy.Field() 37 | cretated_time = scrapy.Field() 38 | updated_time = scrapy.Field() 39 | voteup_count = scrapy.Field() 40 | comment_count = scrapy.Field() 41 | content = scrapy.Field() 42 | 43 | class QuestionItem(scrapy.Item): 44 | ask_user_id = scrapy.Field() 45 | question_id = scrapy.Field() 46 | ask_time = scrapy.Field() 47 | answer_count = scrapy.Field() 48 | followees_count = scrapy.Field() 49 | title = scrapy.Field() 50 | 51 | class ArticleItem(scrapy.Item): 52 | author_id = scrapy.Field() 53 | title = scrapy.Field() 54 | article_id = scrapy.Field() 55 | content = scrapy.Field() 56 | cretated_time = scrapy.Field() 57 | updated_time = scrapy.Field() 58 | voteup_count = scrapy.Field() 59 | comment_count = scrapy.Field() -------------------------------------------------------------------------------- /zhihu/zhihu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import telnetlib 5 | import random 6 | import redis 7 | import json 8 | import os 9 | import threading 10 | import pdb 11 | from scrapy import signals 12 | from .user_agents_pc import agents 13 | from .proxy import initIPPOOLS, updateIPPOOLS 14 | from .cookie import initCookie, updateCookie, removeCookie 15 | from scrapy.utils.response import response_status_message 16 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 17 | from scrapy.exceptions import IgnoreRequest 18 | 19 | # ------------------------------------------ 20 | # 版本:1.0 21 | # 日期:2017-8-06 22 | # 作者:AlexTan 23 | # 24 | # 25 | # ------------------------------------------ 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | class UserAgentMiddleware(object): 30 | """ 换User-Agent """ 31 | 32 | def process_request(self, request, spider): 33 | agent = random.choice(agents) 34 
| request.headers["User-Agent"] = agent 35 | 36 | class ProxyMiddleware(RetryMiddleware): 37 | '''IP代理''' 38 | def __init__(self, settings, crawler): 39 | #自己获取的ip 40 | self.TIMES = 10 41 | RetryMiddleware.__init__(self, settings) 42 | self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) 43 | #initIPPOOLS(self.rconn) 44 | 45 | @classmethod 46 | def from_crawler(cls, crawler): 47 | return cls(crawler.settings, crawler) 48 | 49 | def process_request(self,request,spider): 50 | #pdb.set_trace() 51 | ipNum=len(self.rconn.keys('IP*')) 52 | #pdb.set_trace() 53 | if ipNum<50: 54 | proxy_thread = threading.Thread(target= initIPPOOLS,args = (self.rconn,)) 55 | proxy_thread.setDaemon(True) 56 | proxy_thread.start() 57 | #initIPPOOLS(self.rconn) 58 | if self.TIMES == 3: 59 | baseIP=random.choice(self.rconn.keys('IP:*')) 60 | ip=str(baseIP,'utf-8').replace('IP:','') 61 | try: 62 | IP,PORT,status=ip.split(':') 63 | request.meta['status'] = status 64 | telnetlib.Telnet(IP,port=PORT,timeout=2) #测试ip是否有效 65 | except: 66 | logger.warning("The ip is not available !( IP:%s )" % ip) 67 | updateIPPOOLS(self.rconn,IP+':'+PORT,status) 68 | else: 69 | #pdb.set_trace() 70 | self.IP = "http://" + IP + ':' + PORT 71 | logger.warning("The current IP is %s!" % self.IP) 72 | self.TIMES = 0 73 | updateIPPOOLS(self.rconn,IP+':'+PORT,status,1) 74 | #pdb.set_trace() 75 | else: 76 | self.TIMES += 1 77 | #pdb.set_trace() 78 | if self.IP is not "": 79 | request.meta["proxy"] = self.IP 80 | 81 | def process_response(self,request,response,spider): 82 | if response.status in [400,403,404,429,500,502,503,504]: 83 | self.TIMES = 3 84 | logger.error("%s! error..." % response.status) 85 | #pdb.set_trace() 86 | try: 87 | updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1) 88 | except: 89 | pass 90 | reason = response_status_message(response.status) 91 | return self._retry(request, reason, spider) or response # 重试 92 | else: 93 | return response 94 | 95 | def process_exception(self, request, exception, spider): 96 | #pdb.set_trace() 97 | self.TIMES = 3 98 | try: 99 | updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1) 100 | except: 101 | pass 102 | return request 103 | 104 | class CookiesMiddleware(RetryMiddleware): 105 | """ 维护Cookie """ 106 | 107 | def __init__(self, settings, crawler): 108 | RetryMiddleware.__init__(self, settings) 109 | self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) 110 | initCookie(self.rconn, crawler.spider.name) 111 | 112 | @classmethod 113 | def from_crawler(cls, crawler): 114 | return cls(crawler.settings, crawler) 115 | 116 | def process_request(self, request, spider): 117 | redisKeys = self.rconn.keys() 118 | while len(redisKeys) > 0: 119 | elem = random.choice(redisKeys) 120 | #pdb.set_trace() 121 | if b'zhihuspider:Cookies' in elem: 122 | #pdb.set_trace() 123 | elem = str(elem,'utf-8') 124 | cookie = json.loads(str(self.rconn.get(elem),'utf-8')) 125 | request.cookies = cookie 126 | request.meta["accountText"] = elem.split("Cookies:")[-1] 127 | break 128 | else: 129 | #pdb.set_trace() 130 | redisKeys.remove(elem) 131 | 132 | def process_response(self, request, response, spider): 133 | #pdb.set_trace() 134 | reason = response_status_message(response.status) 135 | if response.status in [300, 301, 302, 303]: 136 | pdb.set_trace() 137 | if reason == 
'301 Moved Permanently': 138 | return self._retry(request, reason, spider) or response # 重试 139 | else: 140 | raise IgnoreRequest 141 | elif response.status in [403, 414]: 142 | logger.error("%s! Stopping..." % response.status) 143 | os.system("pause") 144 | updateCookie(request.meta['accountText'], self.rconn, spider.name, request.cookies) 145 | return self._retry(request, reason, spider) or response # 重试 146 | else: 147 | return response 148 | -------------------------------------------------------------------------------- /zhihu/zhihu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pymongo 3 | import pdb 4 | from .items import ZhihuItem,RelationItem,AnswerItem,QuestionItem,ArticleItem 5 | 6 | # ------------------------------------------ 7 | # 版本:1.0 8 | # 日期:2017-8-06 9 | # 作者:AlexTan 10 | # 11 | # 12 | # ------------------------------------------ 13 | 14 | 15 | class ZhihuPipeline(object): 16 | def __init__(self, mongo_uri, mongo_db): 17 | self.mongo_uri = mongo_uri 18 | self.mongo_db = mongo_db 19 | 20 | @classmethod 21 | def from_crawler(cls,crawler): 22 | return cls( 23 | mongo_uri = crawler.settings.get('MONGO_URI'), 24 | mongo_db = crawler.settings.get('MONGO_DATABASE','zhihu') 25 | ) 26 | 27 | def open_spider(self,spider): 28 | self.client = pymongo.MongoClient(self.mongo_uri) 29 | self.db = self.client[self.mongo_db] 30 | 31 | def close_spider(self,spider): 32 | self.client.close() 33 | 34 | def process_item(self, item, spider): 35 | if isinstance(item, ZhihuItem): 36 | self._process_user_item(item) 37 | elif isinstance(item, AnswerItem): 38 | self._process_answer_item(item) 39 | elif isinstance(item, QuestionItem): 40 | self._process_question_item(item) 41 | elif isinstance(item, ArticleItem): 42 | self._process_article_item(item) 43 | else: 44 | #pdb.set_trace() 45 | self._process_relation_item(item) 46 | return item 47 | 48 | def _process_user_item(self,item): 49 | self.db.UserInfo.insert(dict(item)) 50 | 51 | def _process_relation_item(self,item): 52 | try: 53 | isnext,relation_type = item['relation_type'].split(':') 54 | if isnext == 'next': 55 | for one in item['relations_id']: 56 | #pdb.set_trace() 57 | self.db.Relation.update({'user_id':item['user_id'],'relation_type':relation_type},{"$push":{'relations_id':one}}) 58 | except: 59 | self.db.Relation.insert(dict(item)) 60 | 61 | def _process_answer_item(self,item): 62 | self.db.AnswerInfo.insert(dict(item)) 63 | 64 | def _process_question_item(self,item): 65 | self.db.QuestionInfo.insert(dict(item)) 66 | 67 | def _process_article_item(self,item): 68 | self.db.ArticleInfo.insert(dict(item)) 69 | -------------------------------------------------------------------------------- /zhihu/zhihu/proxy.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import telnetlib 3 | import urllib 4 | import logging 5 | 6 | # ------------------------------------------ 7 | # 版本:1.0 8 | # 日期:2017-8-06 9 | # 作者:AlexTan 10 | # 11 | # 12 | # ------------------------------------------ 13 | 14 | logger = logging.getLogger(__name__) 15 | IPPOOLNUM=20 #一次性从网页获取的IP数量 16 | 17 | def GetIPPOOLS(num): 18 | #大象代理买的ip,5元20000个,每十个差不多有一个能用 19 | IPPOOL=urllib.request.urlopen("http://tpv.daxiangdaili.com/ip/?tid=559480480576119&num="+str(num)+"&operator=1&filter=on&protocol=http&category=2&delay=1").read().decode("utf-8","ignore").split('\r\n') 20 | ''' 21 | #自己获取的ip 22 | 
IPPOOLS1=urllib.request.urlopen("http://127.0.0.1:8000/?types=0&count=20&country=%E5%9B%BD%E5%86%85").read().decode("utf-8",'ignore') 23 | IPPOOLS2=re.findall('\"(\d+\.\d+\.\d+\.\d+\"\,\s*\d+)',IPPOOLS1) 24 | IPPOOL=[i.replace('", ',':') for i in IPPOOLS2] 25 | ''' 26 | return IPPOOL 27 | 28 | def initIPPOOLS(rconn): 29 | """把有效的IP存入 REDIS数据库""" 30 | 31 | ipNum=len(rconn.keys('IP*')) 32 | if ipNum 8 | # 9 | # ------------------------------------------ 10 | 11 | 12 | class SimpleHash(object): 13 | def __init__(self, cap, seed): 14 | self.cap = cap 15 | self.seed = seed 16 | 17 | def hash(self, value): 18 | ret = 0 19 | for i in range(len(value)): 20 | ret += self.seed * ret + ord(value[i]) 21 | return (self.cap - 1) & ret 22 | 23 | 24 | class BloomFilter(object): 25 | def __init__(self, server, key, blockNum=1): 26 | self.bit_size = 1 << 31 # Redis的String类型最大容量为512M,现使用256M 27 | self.seeds = [5, 7, 11, 13, 31] 28 | # self.seeds = [5, 7, 11, 13, 31, 37, 61] 29 | self.server = server 30 | self.key = key 31 | self.blockNum = blockNum 32 | self.hashfunc = [] 33 | for seed in self.seeds: 34 | self.hashfunc.append(SimpleHash(self.bit_size, seed)) 35 | 36 | def isContains(self, str_input): 37 | if not str_input: 38 | return False 39 | ret = True 40 | 41 | name = self.key + str(int(str_input[0:2], 16) % self.blockNum) 42 | for f in self.hashfunc: 43 | loc = f.hash(str_input) 44 | ret = ret & self.server.getbit(name, loc) 45 | return ret 46 | 47 | def insert(self, str_input): 48 | name = self.key + str(int(str_input[0:2], 16) % self.blockNum) 49 | for f in self.hashfunc: 50 | loc = f.hash(str_input) 51 | self.server.setbit(name, loc, 1) -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/scrapy_redis/__init__.py -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | from . import defaults 6 | 7 | 8 | # Shortcut maps 'setting name' -> 'parmater name'. 9 | SETTINGS_PARAMS_MAP = { 10 | 'REDIS_URL': 'url', 11 | 'REDIS_HOST': 'host', 12 | 'REDIS_PORT': 'port', 13 | 'REDIS_ENCODING': 'encoding', 14 | } 15 | 16 | 17 | def get_redis_from_settings(settings): 18 | """Returns a redis client instance from given Scrapy settings object. 19 | 20 | This function uses ``get_client`` to instantiate the client and uses 21 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 22 | can override them using the ``REDIS_PARAMS`` setting. 23 | 24 | Parameters 25 | ---------- 26 | settings : Settings 27 | A scrapy settings object. See the supported settings below. 28 | 29 | Returns 30 | ------- 31 | server 32 | Redis client instance. 33 | 34 | Other Parameters 35 | ---------------- 36 | REDIS_URL : str, optional 37 | Server connection URL. 38 | REDIS_HOST : str, optional 39 | Server host. 40 | REDIS_PORT : str, optional 41 | Server port. 42 | REDIS_ENCODING : str, optional 43 | Data encoding. 44 | REDIS_PARAMS : dict, optional 45 | Additional client parameters. 46 | 47 | """ 48 | params = defaults.REDIS_PARAMS.copy() 49 | params.update(settings.getdict('REDIS_PARAMS')) 50 | # XXX: Deprecate REDIS_* settings. 
51 | for source, dest in SETTINGS_PARAMS_MAP.items(): 52 | val = settings.get(source) 53 | if val: 54 | params[dest] = val 55 | 56 | # Allow ``redis_cls`` to be a path to a class. 57 | if isinstance(params.get('redis_cls'), six.string_types): 58 | params['redis_cls'] = load_object(params['redis_cls']) 59 | 60 | return get_redis(**params) 61 | 62 | 63 | # Backwards compatible alias. 64 | from_settings = get_redis_from_settings 65 | 66 | 67 | def get_redis(**kwargs): 68 | """Returns a redis client instance. 69 | 70 | Parameters 71 | ---------- 72 | redis_cls : class, optional 73 | Defaults to ``redis.StrictRedis``. 74 | url : str, optional 75 | If given, ``redis_cls.from_url`` is used to instantiate the class. 76 | **kwargs 77 | Extra parameters to be passed to the ``redis_cls`` class. 78 | 79 | Returns 80 | ------- 81 | server 82 | Redis client instance. 83 | 84 | """ 85 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 86 | url = kwargs.pop('url', None) 87 | if url: 88 | return redis_cls.from_url(url, **kwargs) 89 | else: 90 | return redis_cls(**kwargs) 91 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | 4 | # For standalone use. 5 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 6 | 7 | PIPELINE_KEY = '%(spider)s:items' 8 | 9 | REDIS_CLS = redis.StrictRedis 10 | REDIS_ENCODING = 'utf-8' 11 | # Sane connection defaults. 12 | REDIS_PARAMS = { 13 | 'socket_timeout': 30, 14 | 'socket_connect_timeout': 30, 15 | 'retry_on_timeout': True, 16 | 'encoding': REDIS_ENCODING, 17 | } 18 | 19 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 20 | SCHEDULER_QUEUE_CLASS = 'zhihu.scrapy_redis.queue.PriorityQueue' 21 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 22 | SCHEDULER_DUPEFILTER_CLASS = 'zhihu.scrapy_redis.dupefilter.RFPDupeFilter' 23 | 24 | START_URLS_KEY = '%(name)s:start_urls' 25 | START_URLS_AS_SET = False 26 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import pdb 4 | from .BloomfilterOnRedis import BloomFilter 5 | from scrapy.dupefilters import BaseDupeFilter 6 | from scrapy.utils.request import request_fingerprint 7 | 8 | from . import defaults 9 | from .connection import get_redis_from_settings 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # TODO: Rename class to RedisDupeFilter. 16 | class RFPDupeFilter(BaseDupeFilter): 17 | """Redis-based request duplicates filter. 18 | 19 | This class can also be used with default Scrapy's scheduler. 20 | 21 | """ 22 | 23 | logger = logger 24 | 25 | def __init__(self, server, key, debug=False): 26 | """Initialize the duplicates filter. 27 | 28 | Parameters 29 | ---------- 30 | server : redis.StrictRedis 31 | The redis server instance. 32 | key : str 33 | Redis key Where to store fingerprints. 34 | debug : bool, optional 35 | Whether to log filtered requests. 36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.debug = debug 41 | self.bf = BloomFilter(server, key, blockNum=1) # you can increase blockNum if your are filtering too many urls 42 | self.logdupes = True 43 | 44 | @classmethod 45 | def from_settings(cls, settings): 46 | """Returns an instance from given settings. 47 | 48 | This uses by default the key ``dupefilter:``. 
When using the 49 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 50 | it needs to pass the spider name in the key. 51 | 52 | Parameters 53 | ---------- 54 | settings : scrapy.settings.Settings 55 | 56 | Returns 57 | ------- 58 | RFPDupeFilter 59 | A RFPDupeFilter instance. 60 | 61 | 62 | """ 63 | server = get_redis_from_settings(settings) 64 | # XXX: This creates one-time key. needed to support to use this 65 | # class as standalone dupefilter with scrapy's default scheduler 66 | # if scrapy passes spider on open() method this wouldn't be needed 67 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 68 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 69 | debug = settings.getbool('DUPEFILTER_DEBUG') 70 | return cls(server, key=key, debug=debug) 71 | 72 | @classmethod 73 | def from_crawler(cls, crawler): 74 | """Returns instance from crawler. 75 | 76 | Parameters 77 | ---------- 78 | crawler : scrapy.crawler.Crawler 79 | 80 | Returns 81 | ------- 82 | RFPDupeFilter 83 | Instance of RFPDupeFilter. 84 | 85 | """ 86 | return cls.from_settings(crawler.settings) 87 | 88 | def request_seen(self, request): 89 | """Returns True if request was already seen. 90 | 91 | Parameters 92 | ---------- 93 | request : scrapy.http.Request 94 | 95 | Returns 96 | ------- 97 | bool 98 | 99 | """ 100 | fp = request_fingerprint(request) 101 | if self.bf.isContains(fp): 102 | return True 103 | else: 104 | self.bf.insert(fp) 105 | return False 106 | 107 | def request_fingerprint(self, request): 108 | """Returns a fingerprint for a given request. 109 | 110 | Parameters 111 | ---------- 112 | request : scrapy.http.Request 113 | 114 | Returns 115 | ------- 116 | str 117 | 118 | """ 119 | return request_fingerprint(request) 120 | 121 | def close(self, reason=''): 122 | """Delete data on close. Called by Scrapy's scheduler. 123 | 124 | Parameters 125 | ---------- 126 | reason : str, optional 127 | 128 | """ 129 | self.clear() 130 | 131 | def clear(self): 132 | """Clears fingerprints data.""" 133 | self.server.delete(self.key) 134 | 135 | def log(self, request, spider): 136 | """Logs given request. 
137 | 138 | Parameters 139 | ---------- 140 | request : scrapy.http.Request 141 | spider : scrapy.spiders.Spider 142 | 143 | """ 144 | if self.debug: 145 | msg = "Filtered duplicate request: %(request)s" 146 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 147 | elif self.logdupes: 148 | msg = ("Filtered duplicate request %(request)s" 149 | " - no more duplicates will be shown" 150 | " (see DUPEFILTER_DEBUG to show all duplicates)") 151 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 152 | self.logdupes = False 153 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | 11 | class RedisPipeline(object): 12 | """Pushes serialized item into a redis list/queue 13 | 14 | Settings 15 | -------- 16 | REDIS_ITEMS_KEY : str 17 | Redis key where to store items. 18 | REDIS_ITEMS_SERIALIZER : str 19 | Object path to serializer function. 20 | 21 | """ 22 | 23 | def __init__(self, server, 24 | key=defaults.PIPELINE_KEY, 25 | serialize_func=default_serialize): 26 | """Initialize pipeline. 27 | 28 | Parameters 29 | ---------- 30 | server : StrictRedis 31 | Redis client instance. 32 | key : str 33 | Redis key where to store items. 34 | serialize_func : callable 35 | Items serializer function. 36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.serialize = serialize_func 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | params = { 45 | 'server': connection.from_settings(settings), 46 | } 47 | if settings.get('REDIS_ITEMS_KEY'): 48 | params['key'] = settings['REDIS_ITEMS_KEY'] 49 | if settings.get('REDIS_ITEMS_SERIALIZER'): 50 | params['serialize_func'] = load_object( 51 | settings['REDIS_ITEMS_SERIALIZER'] 52 | ) 53 | 54 | return cls(**params) 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | return cls.from_settings(crawler.settings) 59 | 60 | def process_item(self, item, spider): 61 | return deferToThread(self._process_item, item, spider) 62 | 63 | def _process_item(self, item, spider): 64 | key = self.item_key(item, spider) 65 | data = self.serialize(item) 66 | self.server.rpush(key, data) 67 | return item 68 | 69 | def item_key(self, item, spider): 70 | """Returns redis key based on given spider. 71 | 72 | Override this function to use a different key depending on the item 73 | and/or spider. 
74 | 75 | """ 76 | return self.key % {'spider': spider.name} 77 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 
104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | # TODO: Deprecate the use of these names. 144 | SpiderQueue = FifoQueue 145 | SpiderStack = LifoQueue 146 | SpiderPriorityQueue = PriorityQueue 147 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 
63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | try: 138 | self.df = load_object(self.dupefilter_cls)( 139 | server=self.server, 140 | key=self.dupefilter_key % {'spider': spider.name}, 141 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 142 | ) 143 | except TypeError as e: 144 | raise ValueError("Failed to instantiate dupefilter class '%s': %s", 145 | self.dupefilter_cls, e) 146 | 147 | if self.flush_on_start: 148 | self.flush() 149 | # notice if there are requests already in the queue to resume the crawl 150 | if len(self.queue): 151 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 152 | 153 | def close(self, reason): 154 | if not self.persist: 155 | self.flush() 156 | 157 | def flush(self): 158 | self.df.clear() 159 | self.queue.clear() 160 | 161 | def enqueue_request(self, request): 162 | if not request.dont_filter and self.df.request_seen(request): 163 | self.df.log(request, self.spider) 164 | return False 165 | if self.stats: 166 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 167 | self.queue.push(request) 168 | return True 169 | 170 | def next_request(self): 171 | block_pop_timeout = 
self.idle_before_close 172 | request = self.queue.pop(block_pop_timeout) 173 | if request and self.stats: 174 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 175 | return request 176 | 177 | def has_pending_requests(self): 178 | return len(self) > 0 -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, CrawlSpider 4 | 5 | from . import connection, defaults 6 | from .utils import bytes_to_str 7 | 8 | 9 | class RedisMixin(object): 10 | """Mixin class to implement reading urls from a redis queue.""" 11 | redis_key = None 12 | redis_batch_size = None 13 | redis_encoding = None 14 | 15 | # Redis client placeholder. 16 | server = None 17 | 18 | def start_requests(self): 19 | """Returns a batch of start requests from redis.""" 20 | return self.next_requests() 21 | 22 | def setup_redis(self, crawler=None): 23 | """Setup redis connection and idle signal. 24 | 25 | This should be called after the spider has set its crawler object. 26 | """ 27 | if self.server is not None: 28 | return 29 | 30 | if crawler is None: 31 | # We allow optional crawler argument to keep backwards 32 | # compatibility. 33 | # XXX: Raise a deprecation warning. 34 | crawler = getattr(self, 'crawler', None) 35 | 36 | if crawler is None: 37 | raise ValueError("crawler is required") 38 | 39 | settings = crawler.settings 40 | 41 | if self.redis_key is None: 42 | self.redis_key = settings.get( 43 | 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY, 44 | ) 45 | 46 | self.redis_key = self.redis_key % {'name': self.name} 47 | 48 | if not self.redis_key.strip(): 49 | raise ValueError("redis_key must not be empty") 50 | 51 | if self.redis_batch_size is None: 52 | # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE). 53 | self.redis_batch_size = settings.getint( 54 | 'REDIS_START_URLS_BATCH_SIZE', 55 | settings.getint('CONCURRENT_REQUESTS'), 56 | ) 57 | 58 | try: 59 | self.redis_batch_size = int(self.redis_batch_size) 60 | except (TypeError, ValueError): 61 | raise ValueError("redis_batch_size must be an integer") 62 | 63 | if self.redis_encoding is None: 64 | self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING) 65 | 66 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' " 67 | "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s", 68 | self.__dict__) 69 | 70 | self.server = connection.from_settings(crawler.settings) 71 | # The idle signal is called when the spider has no requests left, 72 | # that's when we will schedule new requests from redis queue 73 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 74 | 75 | def next_requests(self): 76 | """Returns a request to be scheduled or none.""" 77 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) 78 | fetch_one = self.server.spop if use_set else self.server.lpop 79 | # XXX: Do we need to use a timeout here? 80 | found = 0 81 | # TODO: Use redis pipeline execution. 82 | while found < self.redis_batch_size: 83 | data = fetch_one(self.redis_key) 84 | if not data: 85 | # Queue empty. 
86 | break 87 | req = self.make_request_from_data(data) 88 | if req: 89 | yield req 90 | found += 1 91 | else: 92 | self.logger.debug("Request not made from data: %r", data) 93 | 94 | if found: 95 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key) 96 | 97 | def make_request_from_data(self, data): 98 | """Returns a Request instance from data coming from Redis. 99 | 100 | By default, ``data`` is an encoded URL. You can override this method to 101 | provide your own message decoding. 102 | 103 | Parameters 104 | ---------- 105 | data : bytes 106 | Message from redis. 107 | 108 | """ 109 | url = bytes_to_str(data, self.redis_encoding) 110 | return self.make_requests_from_url(url) 111 | 112 | def schedule_next_requests(self): 113 | """Schedules a request if available""" 114 | # TODO: While there is capacity, schedule a batch of redis requests. 115 | for req in self.next_requests(): 116 | self.crawler.engine.crawl(req, spider=self) 117 | 118 | def spider_idle(self): 119 | """Schedules a request if available, otherwise waits.""" 120 | # XXX: Handle a sentinel to close the spider. 121 | self.schedule_next_requests() 122 | raise DontCloseSpider 123 | 124 | 125 | class RedisSpider(RedisMixin, Spider): 126 | """Spider that reads urls from redis queue when idle. 127 | 128 | Attributes 129 | ---------- 130 | redis_key : str (default: REDIS_START_URLS_KEY) 131 | Redis key where to fetch start URLs from.. 132 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 133 | Number of messages to fetch from redis on each attempt. 134 | redis_encoding : str (default: REDIS_ENCODING) 135 | Encoding to use when decoding messages from redis queue. 136 | 137 | Settings 138 | -------- 139 | REDIS_START_URLS_KEY : str (default: ":start_urls") 140 | Default Redis key where to fetch start URLs from.. 141 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 142 | Default number of messages to fetch from redis on each attempt. 143 | REDIS_START_URLS_AS_SET : bool (default: False) 144 | Use SET operations to retrieve messages from the redis queue. If False, 145 | the messages are retrieve using the LPOP command. 146 | REDIS_ENCODING : str (default: "utf-8") 147 | Default encoding to use when decoding messages from redis queue. 148 | 149 | """ 150 | 151 | @classmethod 152 | def from_crawler(self, crawler, *args, **kwargs): 153 | obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs) 154 | obj.setup_redis(crawler) 155 | return obj 156 | 157 | 158 | class RedisCrawlSpider(RedisMixin, CrawlSpider): 159 | """Spider that reads urls from redis queue when idle. 160 | 161 | Attributes 162 | ---------- 163 | redis_key : str (default: REDIS_START_URLS_KEY) 164 | Redis key where to fetch start URLs from.. 165 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 166 | Number of messages to fetch from redis on each attempt. 167 | redis_encoding : str (default: REDIS_ENCODING) 168 | Encoding to use when decoding messages from redis queue. 169 | 170 | Settings 171 | -------- 172 | REDIS_START_URLS_KEY : str (default: ":start_urls") 173 | Default Redis key where to fetch start URLs from.. 174 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 175 | Default number of messages to fetch from redis on each attempt. 176 | REDIS_START_URLS_AS_SET : bool (default: True) 177 | Use SET operations to retrieve messages from the redis queue. 178 | REDIS_ENCODING : str (default: "utf-8") 179 | Default encoding to use when decoding messages from redis queue. 
180 | 181 | """ 182 | 183 | @classmethod 184 | def from_crawler(self, crawler, *args, **kwargs): 185 | obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs) 186 | obj.setup_redis(crawler) 187 | return obj 188 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mock 4 | import redis 5 | 6 | from scrapy import Request, Spider 7 | from unittest import TestCase 8 | 9 | from . import connection 10 | from .dupefilter import RFPDupeFilter 11 | from .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack 12 | from .scheduler import Scheduler 13 | 14 | 15 | # allow test settings from environment 16 | REDIS_HOST = os.environ.get('REDIST_HOST', 'localhost') 17 | REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) 18 | 19 | 20 | class RedisTestMixin(object): 21 | 22 | @property 23 | def server(self): 24 | if not hasattr(self, '_redis'): 25 | self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) 26 | return self._redis 27 | 28 | def clear_keys(self, prefix): 29 | keys = self.server.keys(prefix + '*') 30 | if keys: 31 | self.server.delete(*keys) 32 | 33 | 34 | class DupeFilterTest(RedisTestMixin, TestCase): 35 | 36 | def setUp(self): 37 | self.key = 'scrapy_redis:tests:dupefilter:' 38 | self.df = RFPDupeFilter(self.server, self.key) 39 | 40 | def tearDown(self): 41 | self.clear_keys(self.key) 42 | 43 | def test_dupe_filter(self): 44 | req = Request('http://example.com') 45 | 46 | self.assertFalse(self.df.request_seen(req)) 47 | self.assertTrue(self.df.request_seen(req)) 48 | 49 | self.df.close('nothing') 50 | 51 | 52 | class QueueTestMixin(RedisTestMixin): 53 | 54 | queue_cls = None 55 | 56 | def setUp(self): 57 | self.spider = Spider('myspider') 58 | self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name 59 | self.q = self.queue_cls(self.server, Spider('myspider'), self.key) 60 | 61 | def tearDown(self): 62 | self.clear_keys(self.key) 63 | 64 | def test_clear(self): 65 | self.assertEqual(len(self.q), 0) 66 | 67 | for i in range(10): 68 | # XXX: can't use same url for all requests as SpiderPriorityQueue 69 | # uses redis' set implemention and we will end with only one 70 | # request in the set and thus failing the test. It should be noted 71 | # that when using SpiderPriorityQueue it acts as a request 72 | # duplication filter whenever the serielized requests are the same. 73 | # This might be unwanted on repetitive requests to the same page 74 | # even with dont_filter=True flag. 
75 | req = Request('http://example.com/?page=%s' % i) 76 | self.q.push(req) 77 | self.assertEqual(len(self.q), 10) 78 | 79 | self.q.clear() 80 | self.assertEqual(len(self.q), 0) 81 | 82 | 83 | class SpiderQueueTest(QueueTestMixin, TestCase): 84 | 85 | queue_cls = SpiderQueue 86 | 87 | def test_queue(self): 88 | req1 = Request('http://example.com/page1') 89 | req2 = Request('http://example.com/page2') 90 | 91 | self.q.push(req1) 92 | self.q.push(req2) 93 | 94 | out1 = self.q.pop() 95 | out2 = self.q.pop() 96 | 97 | self.assertEqual(out1.url, req1.url) 98 | self.assertEqual(out2.url, req2.url) 99 | 100 | 101 | class SpiderPriorityQueueTest(QueueTestMixin, TestCase): 102 | 103 | queue_cls = SpiderPriorityQueue 104 | 105 | def test_queue(self): 106 | req1 = Request('http://example.com/page1', priority=100) 107 | req2 = Request('http://example.com/page2', priority=50) 108 | req3 = Request('http://example.com/page2', priority=200) 109 | 110 | self.q.push(req1) 111 | self.q.push(req2) 112 | self.q.push(req3) 113 | 114 | out1 = self.q.pop() 115 | out2 = self.q.pop() 116 | out3 = self.q.pop() 117 | 118 | self.assertEqual(out1.url, req3.url) 119 | self.assertEqual(out2.url, req1.url) 120 | self.assertEqual(out3.url, req2.url) 121 | 122 | 123 | class SpiderStackTest(QueueTestMixin, TestCase): 124 | 125 | queue_cls = SpiderStack 126 | 127 | def test_queue(self): 128 | req1 = Request('http://example.com/page1') 129 | req2 = Request('http://example.com/page2') 130 | 131 | self.q.push(req1) 132 | self.q.push(req2) 133 | 134 | out1 = self.q.pop() 135 | out2 = self.q.pop() 136 | 137 | self.assertEqual(out1.url, req2.url) 138 | self.assertEqual(out2.url, req1.url) 139 | 140 | 141 | class SchedulerTest(RedisTestMixin, TestCase): 142 | 143 | def setUp(self): 144 | self.persist = False 145 | self.key_prefix = 'scrapy_redis:tests:' 146 | self.queue_key = self.key_prefix + '%(spider)s:requests' 147 | self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' 148 | self.idle_before_close = 0 149 | self.scheduler = Scheduler(self.server, self.persist, self.queue_key, 150 | SpiderQueue, self.dupefilter_key, 151 | self.idle_before_close) 152 | self.spider = Spider('myspider') 153 | 154 | def tearDown(self): 155 | self.clear_keys(self.key_prefix) 156 | 157 | def test_scheduler(self): 158 | # default no persist 159 | self.assertFalse(self.scheduler.persist) 160 | 161 | self.scheduler.open(self.spider) 162 | self.assertEqual(len(self.scheduler), 0) 163 | 164 | req = Request('http://example.com') 165 | self.scheduler.enqueue_request(req) 166 | self.assertTrue(self.scheduler.has_pending_requests()) 167 | self.assertEqual(len(self.scheduler), 1) 168 | 169 | # dupefilter in action 170 | self.scheduler.enqueue_request(req) 171 | self.assertEqual(len(self.scheduler), 1) 172 | 173 | out = self.scheduler.next_request() 174 | self.assertEqual(out.url, req.url) 175 | 176 | self.assertFalse(self.scheduler.has_pending_requests()) 177 | self.assertEqual(len(self.scheduler), 0) 178 | 179 | self.scheduler.close('finish') 180 | 181 | def test_scheduler_persistent(self): 182 | # TODO: Improve this test to avoid the need to check for log messages. 
183 | self.spider.log = mock.Mock(spec=self.spider.log) 184 | 185 | self.scheduler.persist = True 186 | self.scheduler.open(self.spider) 187 | 188 | self.assertEqual(self.spider.log.call_count, 0) 189 | 190 | self.scheduler.enqueue_request(Request('http://example.com/page1')) 191 | self.scheduler.enqueue_request(Request('http://example.com/page2')) 192 | 193 | self.assertTrue(self.scheduler.has_pending_requests()) 194 | self.scheduler.close('finish') 195 | 196 | self.scheduler.open(self.spider) 197 | self.spider.log.assert_has_calls([ 198 | mock.call("Resuming crawl (2 requests scheduled)"), 199 | ]) 200 | self.assertEqual(len(self.scheduler), 2) 201 | 202 | self.scheduler.persist = False 203 | self.scheduler.close('finish') 204 | 205 | self.assertEqual(len(self.scheduler), 0) 206 | 207 | 208 | class ConnectionTest(TestCase): 209 | 210 | # We can get a connection from just REDIS_URL. 211 | def test_redis_url(self): 212 | settings = dict( 213 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 214 | ) 215 | 216 | server = connection.from_settings(settings) 217 | connect_args = server.connection_pool.connection_kwargs 218 | 219 | self.assertEqual(connect_args['host'], 'localhost') 220 | self.assertEqual(connect_args['port'], 9001) 221 | self.assertEqual(connect_args['password'], 'bar') 222 | self.assertEqual(connect_args['db'], 42) 223 | 224 | # We can get a connection from REDIS_HOST/REDIS_PORT. 225 | def test_redis_host_port(self): 226 | settings = dict( 227 | REDIS_HOST = 'localhost', 228 | REDIS_PORT = 9001 229 | ) 230 | 231 | server = connection.from_settings(settings) 232 | connect_args = server.connection_pool.connection_kwargs 233 | 234 | self.assertEqual(connect_args['host'], 'localhost') 235 | self.assertEqual(connect_args['port'], 9001) 236 | 237 | # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 238 | def test_redis_url_precedence(self): 239 | settings = dict( 240 | REDIS_HOST = 'baz', 241 | REDIS_PORT = 1337, 242 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 243 | ) 244 | 245 | server = connection.from_settings(settings) 246 | connect_args = server.connection_pool.connection_kwargs 247 | 248 | self.assertEqual(connect_args['host'], 'localhost') 249 | self.assertEqual(connect_args['port'], 9001) 250 | self.assertEqual(connect_args['password'], 'bar') 251 | self.assertEqual(connect_args['db'], 42) 252 | 253 | # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. 254 | def test_redis_host_port_fallback(self): 255 | settings = dict( 256 | REDIS_HOST = 'baz', 257 | REDIS_PORT = 1337, 258 | REDIS_URL = None 259 | ) 260 | 261 | server = connection.from_settings(settings) 262 | connect_args = server.connection_pool.connection_kwargs 263 | 264 | self.assertEqual(connect_args['host'], 'baz') 265 | self.assertEqual(connect_args['port'], 1337) 266 | 267 | # We use default values for REDIS_HOST/REDIS_PORT. 
268 | def test_redis_default(self): 269 | settings = dict() 270 | 271 | server = connection.from_settings(settings) 272 | connect_args = server.connection_pool.connection_kwargs 273 | 274 | self.assertEqual(connect_args['host'], 'localhost') 275 | self.assertEqual(connect_args['port'], 6379) 276 | -------------------------------------------------------------------------------- /zhihu/zhihu/scrapy_redis/utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def bytes_to_str(s, encoding='utf-8'): 5 | """Returns a str if a bytes object is given.""" 6 | if six.PY3 and isinstance(s, bytes): 7 | return s.decode(encoding) 8 | return s -------------------------------------------------------------------------------- /zhihu/zhihu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # ------------------------------------------ 4 | # 版本:1.0 5 | # 日期:2017-8-06 6 | # 作者:AlexTan 7 | # 8 | # 9 | # ------------------------------------------ 10 | 11 | BOT_NAME = 'zhihu' 12 | 13 | SPIDER_MODULES = ['zhihu.spiders'] 14 | NEWSPIDER_MODULE = 'zhihu.spiders' 15 | 16 | 17 | 18 | 19 | REDIRECT_ENABLED = False 20 | RETRY_TIMES = 1 21 | DOWNLOAD_TIMEOUT = 10 #下载超时时间 22 | 23 | 24 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 25 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36' 26 | 27 | 28 | #分布式配置 29 | SCHEDULER = "zhihu.scrapy_redis.scheduler.Scheduler" 30 | SCHEDULER_PERSIST = True 31 | DUPEFILTER_CLASS = "zhihu.scrapy_redis.dupefilter.RFPDupeFilter" 32 | 33 | 34 | # 种子队列的信息 35 | REDIS_URL = None 36 | REDIS_HOST = '127.0.0.1' 37 | REDIS_PORT = 6379#6379 38 | FILTER_URL = None 39 | FILTER_HOST = '127.0.0.1' 40 | FILTER_PORT = 6379#6379 41 | FILTER_DB = 0 42 | 43 | 44 | 45 | MONGO_URI = 'mongodb://127.0.0.1:27017/' 46 | MONGO_DATABASE = 'zhihu3' 47 | 48 | 49 | DOWNLOADER_MIDDLEWARES = { 50 | 'zhihu.middlewares.UserAgentMiddleware': 543, 51 | 'zhihu.middlewares.CookiesMiddleware': 544, 52 | #'zhihu.middlewares.ProxyMiddleware':125, 53 | #"scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 545, 54 | } 55 | 56 | 57 | ITEM_PIPELINES = { 58 | 'zhihu.pipelines.ZhihuPipeline': 301, 59 | } 60 | 61 | ''' 62 | DOWNLOAD_DELAY = 3 63 | AUTOTHROTTLE_ENABLED = True 64 | AUTOTHROTTLE_START_DELAY = 3 65 | AUTOTHROTTLE_MAX_DELAY = 60 66 | ''' 67 | 68 | # Obey robots.txt rules 69 | #ROBOTSTXT_OBEY = True 70 | 71 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 72 | #CONCURRENT_REQUESTS = 1 73 | 74 | # Configure a delay for requests for the same website (default: 0) 75 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 76 | # See also autothrottle settings and docs 77 | #DOWNLOAD_DELAY = 3 78 | # The download delay setting will honor only one of: 79 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 80 | #CONCURRENT_REQUESTS_PER_IP = 16 81 | 82 | # Disable cookies (enabled by default) 83 | #COOKIES_ENABLED = False 84 | 85 | # Disable Telnet Console (enabled by default) 86 | #TELNETCONSOLE_ENABLED = False 87 | 88 | # Override the default request headers: 89 | #DEFAULT_REQUEST_HEADERS = { 90 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 91 | # 'Accept-Language': 'en', 92 | #} 93 | 94 | # Enable or disable spider middlewares 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 96 | #SPIDER_MIDDLEWARES = { 97 | # 'zhihu.middlewares.ZhihuSpiderMiddleware': 543, 98 | #} 99 | 100 | # Enable or disable downloader middlewares 101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 102 | 103 | 104 | # Enable or disable extensions 105 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 106 | #EXTENSIONS = { 107 | # 'scrapy.extensions.telnet.TelnetConsole': None, 108 | #} 109 | 110 | # Configure item pipelines 111 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 112 | 113 | 114 | # Enable and configure the AutoThrottle extension (disabled by default) 115 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 116 | #AUTOTHROTTLE_ENABLED = True 117 | # The initial download delay 118 | #AUTOTHROTTLE_START_DELAY = 5 119 | # The maximum download delay to be set in case of high latencies 120 | #AUTOTHROTTLE_MAX_DELAY = 60 121 | # The average number of requests Scrapy should be sending in parallel to 122 | # each remote server 123 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 124 | # Enable showing throttling stats for every response received: 125 | #AUTOTHROTTLE_DEBUG = False 126 | 127 | # Enable and configure HTTP caching (disabled by default) 128 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 129 | #HTTPCACHE_ENABLED = True 130 | #HTTPCACHE_EXPIRATION_SECS = 0 131 | #HTTPCACHE_DIR = 'httpcache' 132 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 133 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 134 | -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__pycache__/zhihuspider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/spiders/__pycache__/zhihuspider.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__pycache__/zhihuspider0.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/spiders/__pycache__/zhihuspider0.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__pycache__/zhihuspider1.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexTan-b-z/ZhihuSpider/7f35d157fa7f3a7ac8545b386e98286ee2764462/zhihu/zhihu/spiders/__pycache__/zhihuspider1.cpython-35.pyc -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/zhihuspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import pdb 5 | import json 6 | from selenium import webdriver 7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 8 | from ..items import ZhihuItem,RelationItem 9 | from scrapy.http import Request,FormRequest 10 | from scrapy_redis.spiders import RedisSpider 11 | 12 | # ------------------------------------------ 13 | # 版本:1.0 14 | # 日期:2017-8-06 15 | # 作者:AlexTan 16 | # 17 | # 18 | # ------------------------------------------ 19 | 20 | 21 | #zhihuspider1是模拟浏览器爬(速度慢,不建议,仅供学习) zhihuspider0抓包爬(速度快) 22 | class ZhihuspiderSpider(RedisSpider): 23 | #class ZhihuspiderSpider(scrapy.Spider): 24 | name = "zhihuspider1" 25 | #allowed_domains = ["zhihu.com"] 26 | host = 'https://www.zhihu.com' 27 | redis_key = "zhihuspider:start_urls" 28 | #start_urls = ['https://www.zhihu.com/people/yun-he-shu-ju-8/answers'] 29 | strat_user_id = ['yun-he-shu-ju-8'] 30 | #pdb.set_trace() 31 | dcap = dict(DesiredCapabilities.PHANTOMJS) 32 | dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0") 33 | dcap["phantomjs.page.settings.loadImages"] = False 34 | obj = webdriver.PhantomJS(desired_capabilities=dcap) 35 | 36 | 37 | def start_requests(self): 38 | for one in self.strat_user_id: 39 | yield Request('https://www.zhihu.com/people/'+one+'/answers',callback=self.parse,dont_filter=True) 40 | #return [Request('https://www.zhihu.com/#signin',callback=self.start_login,meta={'cookiejar':1})] #这个登录已不可用,仅供学习 41 | 42 | def start_login(self,response): 43 | xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first() 44 | return [FormRequest('https://www.zhihu.com/login/phone_num',method='POST',meta={'cookiejar':response.meta['cookiejar']},formdata={ 45 | 
#'_xsrf':xsrf, 46 | 'password':'88888888', 47 | 'remember_me':"true", 48 | 'phone_num':'666666'}, 49 | callback=self.after_login 50 | )] 51 | 52 | def after_login(self,response): 53 | pdb.set_trace() 54 | if json.loads(response.body)['msg'].encode('utf8') == "登录成功": 55 | self.logger.info("登录成功!%s" % str(response.meta['cookiejar'])) 56 | print("登录成功!") 57 | self.obj.add_cookie(response.meta['cookiejar']) 58 | for one in self.strat_user_id: 59 | yield Request('https://www.zhihu.com/people/'+one+'/answers',meta={'cookiejar':response.meta['cookiejar']},callback=self.parse) 60 | else: 61 | self.logger.error('登录失败') 62 | 63 | def __del__(self): 64 | self.obj.quit() 65 | 66 | def parse(self, response): 67 | item = ZhihuItem() 68 | name = response.xpath('//span[@class="ProfileHeader-name"]/text()').extract()[0] 69 | #pdb.set_trace() 70 | user_image_url = response.xpath('//img[@class="Avatar Avatar--large UserAvatar-inner"]/@srcset').extract()[0].replace(' 2x','') 71 | user_id = re.findall('people\/(.*?)\/',response.url)[0] 72 | gender_icon = response.xpath('.//svg[@class="Icon Icon--male" or @class="Icon Icon--female"]/@class').extract() 73 | #pdb.set_trace() 74 | gender = "" 75 | if gender_icon: 76 | if gender_icon[0] == "Icon Icon--female": 77 | gender = "女" 78 | elif gender_icon[0] == "Icon Icon--male": 79 | gender = "男" 80 | item['name'] = name 81 | item['user_id'] = user_id 82 | item['user_image_url'] = user_image_url 83 | item['gender'] = gender 84 | try: 85 | num = response.xpath('//div[@class="NumberBoard-value"]/text()').extract() 86 | item['followees_num'] = num[0] 87 | item['followers_num'] = num[1] 88 | followees_url = response.url.replace('answers','following') 89 | followers_url = response.url.replace('answers','followers') 90 | relation_item = RelationItem() 91 | relation_item['relations_id'] = [] 92 | relation_item['user_id'] = user_id 93 | relation_item['relation_type'] = 'followees' 94 | yield Request(followees_url,callback=self.relations,meta={'page':1,'item':relation_item}) 95 | relation_item['relation_type'] = 'followers' 96 | yield Request(followers_url,callback=self.relations,meta={'page':1,'item':relation_item}) 97 | except: 98 | print("需要登录!") 99 | 100 | self.obj.get(response.url) 101 | try: 102 | self.obj.find_element_by_class_name('ProfileHeader-expandButton').click() 103 | first = self.obj.find_elements_by_xpath('//div[@class="ProfileHeader-detailItem"]') 104 | for one in first: 105 | label = one.find_element_by_class_name('ProfileHeader-detailLabel').text 106 | if label == "居住地": 107 | location = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n',',') 108 | item['location'] = location 109 | elif label == "所在行业" or "行业": 110 | business = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n',',') 111 | item['business'] = business 112 | elif label == "职业经历": 113 | professional = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n',',') 114 | item['professional'] = professional 115 | elif label == "教育经历": 116 | education = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n',',') 117 | item['education'] = education 118 | else: 119 | pass 120 | except: 121 | pass 122 | yield item 123 | 124 | def relations(self,response): 125 | self.obj.get(response.url) 126 | followees_a = self.obj.find_elements_by_xpath('//a[@class="UserLink-link"]') 127 | #pdb.set_trace() 128 | #followees_a = response.xpath('//a[@class="UserLink-link"]/@href').extract() 129 | followees = [] 130 | for one 
in followees_a: 131 | try: 132 | one = one.get_attribute('href') 133 | followees.append(one.replace('https://www.zhihu.com/people/','')) 134 | except: 135 | pass 136 | followees = list(set(followees)) 137 | #pdb.set_trace() 138 | response.meta['item']['relations_id']+=followees 139 | nextpage_button = response.xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').extract() 140 | if nextpage_button: 141 | #pdb.set_trace() 142 | nextpage_url = response.url.replace('?page='+str(response.meta['page']),'') + "?page=" + str(response.meta['page']+1) 143 | yield Request(nextpage_url,callback=self.relations,meta={'page':response.meta['page']+1,'item':response.meta['item']}) 144 | else: 145 | yield response.meta['item'] 146 | for user in followees: 147 | yield Request('https://www.zhihu.com/people/'+user+'/answers',callback=self.parse) 148 | -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/zhihuspider0.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import pdb 5 | import json 6 | from scrapy.http import Request 7 | from ..items import ZhihuItem,RelationItem,AnswerItem,QuestionItem,ArticleItem 8 | from ..scrapy_redis.spiders import RedisSpider 9 | 10 | # ------------------------------------------ 11 | # 版本:1.0 12 | # 日期:2017-8-06 13 | # 作者:AlexTan 14 | # 15 | # 16 | # ------------------------------------------ 17 | 18 | class Zhihuspider0Spider(RedisSpider): 19 | name = 'zhihuspider' 20 | redis_key = "zhihuspider:start_urls" 21 | allowed_domains = ['zhihu.com'] 22 | start_urls = ['http://zhihu.com/'] 23 | strat_user_id = ['yun-he-shu-ju-8'] 24 | 25 | def start_requests(self): 26 | for one in self.strat_user_id: 27 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',meta={'user_id':one},callback=self.parse) 28 | 29 | 30 | def parse(self, response): 31 | json_result = str(response.body,encoding="utf8").replace('false','0').replace('true','1') 32 | dict_result = eval(json_result) 33 | item = ZhihuItem() 34 | if dict_result['gender'] == 1: 35 | item['gender'] = '男' 36 | elif dict_result['gender'] == 0: 37 | item['gender'] = '女' 38 | else: 39 | item['gender'] = '未知' 40 | item['user_id'] = dict_result['url_token'] 41 | item['user_image_url'] = dict_result['avatar_url'][:-6] + 'xl.jpg' 42 | item['name'] = dict_result['name'] 43 | item['locations'] = [] 44 | for one in dict_result['locations']: 45 | item['locations'].append(one['name']) 46 | try: 47 | item['business'] = dict_result['business']['name'] 48 | except: 49 | try: 50 | item['business'] = dict_result['industry_category'] 51 | except: 52 | pass 53 | 54 | item['education'] = [] 55 | for one in dict_result['educations']: 56 | try: 57 | education = one['school']['name'] + ":" + one['major']['name'] 58 | except: 59 | try: 60 | education = one['school']['name'] 61 | except: 62 | pass 63 | item['education'].append(education) 64 | #pdb.set_trace() 65 | item['followees_num'] = dict_result['following_count'] 66 | item['followers_num'] = dict_result['follower_count'] 67 | item['employments'] = [] 68 | for one in dict_result['employments']: 69 | try: 70 | employment = one['company']['name'] + ":" + one['job']['name'] 71 | except: 72 | try: 73 | employment = one['company']['name'] 74 | except: 75 | pass 76 | 
item['employments'].append(employment) 77 | #pdb.set_trace() 78 | yield item 79 | item = RelationItem() 80 | one = response.meta['user_id'] 81 | item['relations_id'] = [] 82 | item['user_id'] = one 83 | item['relation_type'] = '' 84 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'/followers?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_relation,meta={'item':item,'offset':0,'relation_type':'followers'}) 85 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'/followees?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_relation,meta={'item':item,'offset':0,'relation_type':'followees'}) 86 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'/answers?include=data[*].comment_count,content,voteup_count,created_time,updated_time;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_answer,meta={'answer_user_id':one,'offset':0}) 87 | yield Request('https://www.zhihu.com/people/'+one+'/asks?page=1',callback=self.parse_question,meta={'ask_user_id':one,'page':1}) 88 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'/articles?include=data[*].comment_count,content,voteup_count,created,updated;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_article,meta={'author_id':one,'offset':0}) 89 | 90 | def parse_relation(self,response): 91 | json_result = str(response.body,encoding="utf8").replace('false','0').replace('true','1') 92 | dict_result = eval(json_result) 93 | relations_id = [] 94 | for one in dict_result['data']: 95 | relations_id.append(one['url_token']) 96 | response.meta['item']['relations_id'] = relations_id 97 | if response.meta['offset'] == 0: 98 | response.meta['item']['relation_type'] = response.meta['relation_type'] 99 | else: 100 | response.meta['item']['relation_type'] = 'next:' + response.meta['relation_type'] 101 | #pdb.set_trace() 102 | yield response.meta['item'] 103 | for one in response.meta['item']['relations_id']: 104 | yield Request('https://www.zhihu.com/api/v4/members/'+one+'?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',meta={'user_id':one},callback=self.parse) 105 | #pdb.set_trace() 106 | if dict_result['paging']['is_end'] == 0: 107 | #pdb.set_trace() 108 | offset = response.meta['offset'] + 20 109 | next_page = re.findall('(.*offset=)\d+',response.url)[0] 110 | #pdb.set_trace() 111 | yield Request(next_page + str(offset),callback=self.parse_relation,meta={'item':response.meta['item'],'offset':offset,'relation_type':response.meta['relation_type']}) 112 | 113 | def parse_answer(self,response): 114 | json_result = str(response.body,encoding="utf8").replace('false','0').replace('true','1') 115 | dict_result = eval(json_result) 116 | for one in dict_result['data']: 117 | item = AnswerItem() 118 | item['answer_user_id'] = response.meta['answer_user_id'] 119 | item['answer_id'] = one['id'] 120 | item['question_id'] = one['question']['id'] 121 | #pdb.set_trace() 122 | item['cretated_time'] = one['created_time'] 123 | item['updated_time'] = one['updated_time'] 124 | item['voteup_count'] = one['voteup_count'] 125 | item['comment_count'] = one['comment_count'] 126 | item['content'] = one['content'] 127 | yield item 128 | if dict_result['paging']['is_end'] == 0: 129 | offset = response.meta['offset'] + 20 130 | next_page = 
re.findall('(.*offset=)\d+',response.url)[0] 131 | yield Request(next_page + str(offset),callback=self.parse_answer,meta={'answer_user_id':response.meta['answer_user_id'],'offset':offset}) 132 | 133 | def parse_question(self,response): 134 | list_item = response.xpath('//div[@class="List-item"]') 135 | for one in list_item: 136 | item = QuestionItem() 137 | item['ask_user_id'] = response.meta['ask_user_id'] 138 | title = one.xpath('.//div[@class="QuestionItem-title"]') 139 | item['title'] = title.xpath('./a/text()').extract()[0] 140 | item['question_id'] = title.xpath('./a/@href').extract()[0].replace('/question/','') 141 | content_item = one.xpath('.//div[@class="ContentItem-status"]//span/text()').extract() 142 | item['ask_time'] = content_item[0] 143 | item['answer_count'] = content_item[1] 144 | item['followees_count'] = content_item[2] 145 | yield item 146 | next_page = response.xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]/text()').extract() 147 | if next_page: 148 | response.meta['page'] += 1 149 | next_url = re.findall('(.*page=)\d+',response.url)[0] + str(response.meta['page']) 150 | yield Request(next_url,callback=self.parse_question,meta={'ask_user_id':response.meta['ask_user_id'],'page':response.meta['page']}) 151 | 152 | def parse_article(self,response): 153 | json_result = str(response.body,encoding="utf8").replace('false','0').replace('true','1') 154 | dict_result = eval(json_result) 155 | for one in dict_result['data']: 156 | item = ArticleItem() 157 | item['author_id'] = response.meta['author_id'] 158 | item['title'] = one['title'] 159 | item['article_id'] = one['id'] 160 | item['content'] = one['content'] 161 | #pdb.set_trace() 162 | item['cretated_time'] = one['created'] 163 | item['updated_time'] = one['updated'] 164 | item['voteup_count'] = one['voteup_count'] 165 | item['comment_count'] = one['comment_count'] 166 | yield item 167 | if dict_result['paging']['is_end'] == 0: 168 | offset = response.meta['offset'] + 20 169 | next_page = re.findall('(.*offset=)\d+',response.url)[0] 170 | yield Request(next_page + str(offset),callback=self.parse_article,meta={'author_id':response.meta['author_id'],'offset':offset}) 171 | -------------------------------------------------------------------------------- /zhihu/zhihu/user_agents_pc.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | # ------------------------------------------ 4 | # 版本:1.0 5 | # 日期:2017-8-06 6 | # 作者:AlexTan 7 | # 8 | # 9 | # ------------------------------------------ 10 | 11 | agents = [ 12 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0", 14 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", 15 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36", 16 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", 17 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", 18 | ] -------------------------------------------------------------------------------- /zhihu/zhihu/yumdama.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import http.client, mimetypes, urllib, json, time, 
requests 3 | import pdb 4 | 5 | ###################################################################### 6 | 7 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 8 | # 所有函数请查询 http://www.yundama.com/apidoc 9 | 10 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 11 | # 2. http://www.yundama.com/developer/myapp 添加新软件 12 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 13 | 14 | # 用户名 15 | username = '' 16 | 17 | # 密码 18 | password = '' 19 | 20 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 21 | appid = 0000 22 | 23 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 24 | appkey = '' 25 | 26 | # 图片文件 27 | filename = 'captcha.png' 28 | 29 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 30 | codetype = 1004 31 | 32 | # 超时时间,秒 33 | timeout = 60 34 | 35 | 36 | ###################################################################### 37 | 38 | class YDMHttp(): 39 | 40 | apiurl = 'http://api.yundama.com/api.php' 41 | username = '' 42 | password = '' 43 | appid = '' 44 | appkey = '' 45 | 46 | def __init__(self, username, password, appid, appkey): 47 | self.username = username 48 | self.password = password 49 | self.appid = str(appid) 50 | self.appkey = appkey 51 | 52 | def request(self, fields, files=[]): 53 | response = self.post_url(self.apiurl, fields, files) 54 | response = json.loads(response) 55 | return response 56 | 57 | def balance(self): 58 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 59 | response = self.request(data) 60 | if (response): 61 | if (response['ret'] and response['ret'] < 0): 62 | return response['ret'] 63 | else: 64 | return response['balance'] 65 | else: 66 | return -9001 67 | 68 | def login(self): 69 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 70 | response = self.request(data) 71 | if (response): 72 | if (response['ret'] and response['ret'] < 0): 73 | return response['ret'] 74 | else: 75 | return response['uid'] 76 | else: 77 | return -9001 78 | 79 | def upload(self, filename, codetype, timeout): 80 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 81 | file = {'file': filename} 82 | response = self.request(data, file) 83 | if (response): 84 | if (response['ret'] and response['ret'] < 0): 85 | return response['ret'] 86 | else: 87 | return response['cid'] 88 | else: 89 | return -9001 90 | 91 | def result(self, cid): 92 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)} 93 | response = self.request(data) 94 | return response and response['text'] or '' 95 | 96 | def decode(self, filename, codetype, timeout): 97 | cid = self.upload(filename, codetype, timeout) 98 | if (cid > 0): 99 | for i in range(0, timeout): 100 | result = self.result(cid) 101 | if (result != ''): 102 | return cid, result 103 | else: 104 | time.sleep(1) 105 | return -3003, '' 106 | else: 107 | return cid, '' 108 | 109 | def post_url(self, url, fields, files=[]): 110 | for key in files: 111 | files[key] = open(files[key], 'rb'); 112 | res = requests.post(url, files=files, data=fields) 113 | return res.text 114 | 115 | 116 | ###################################################################### 117 | 118 | 119 | def identify(): 120 | if (username == 'username'): 121 | print ('请设置好相关参数再测试') 122 | else: 123 | 
        #pdb.set_trace()
124 |         # initialize the client
125 |         yundama = YDMHttp(username, password, appid, appkey)
126 | 
127 |         # log in to yundama
128 |         uid = yundama.login()
129 |         # print 'uid: %s' % uid
130 | 
131 |         # check the account balance
132 |         balance = yundama.balance()
133 |         # print 'balance: %s' % balance
134 | 
135 |         # start recognition: image file, captcha type id, timeout in seconds; returns (cid, result)
136 |         cid, result = yundama.decode(filename, codetype, timeout)
137 |         # print 'cid: %s, result: %s' % (cid, result)
138 |         return result
139 | 
--------------------------------------------------------------------------------
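
A note on seeding the crawl: both spiders above set redis_key = "zhihuspider:start_urls", and the RedisSpider docstring in scrapy_redis/spiders.py explains that start URLs are popped from that key with LPOP (REDIS_START_URLS_AS_SET defaults to False) and turned into ordinary Requests when the spider goes idle. The sketch below only illustrates that mechanism; it is not a file shipped with this repository, the script name is made up, the host and port mirror REDIS_HOST/REDIS_PORT in zhihu/settings.py, and the profile URL reuses the sample id already listed in strat_user_id.

    # seed_start_urls.py -- illustrative sketch, not part of the repository
    import redis

    # Connection parameters mirror REDIS_HOST / REDIS_PORT in zhihu/settings.py.
    r = redis.StrictRedis(host='127.0.0.1', port=6379)

    # RedisSpider pops this key with LPOP and builds a Request from each URL,
    # so pushing a plain profile URL is enough to wake an idle spider.
    r.lpush('zhihuspider:start_urls',
            'https://www.zhihu.com/people/yun-he-shu-ju-8/answers')

Both spiders also override start_requests() and seed themselves from strat_user_id, so pushing URLs by hand mainly matters when the requests queue in Redis has been drained. Note that Zhihuspider0Spider.parse reads response.meta['user_id'], so URLs pushed this way suit the selenium-based zhihuspider1 better than the API-based zhihuspider.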