├── README.md
├── Zhihu
│   ├── __init__.py
│   ├── captcha
│   │   ├── chinese_captcha.jpeg
│   │   └── english_captcha.jpeg
│   ├── cookies
│   │   └── zhihu.cookies
│   ├── failed_urls
│   │   └── failed_urls.json
│   ├── items.py
│   ├── libs
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── bloomfilter.cpython-36.pyc
│   │   │   ├── chaojiying.cpython-36.pyc
│   │   │   ├── common.cpython-36.pyc
│   │   │   ├── crawl_ip_proxy.cpython-36.pyc
│   │   │   └── proxy.cpython-36.pyc
│   │   ├── bloomfilter.py
│   │   ├── chaojiying.py
│   │   ├── common.py
│   │   ├── proxy.py
│   │   └── zheye_test.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── zhihu.py
│   ├── zheye
│   │   ├── Kaiti-SC-Bold.ttf
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── util.cpython-36.pyc
│   │   ├── util.py
│   │   ├── zheyeV3.keras
│   │   ├── zheyeV4.keras
│   │   └── zheyeV5.keras
│   └── zhihu_image
│       ├── a.gif
│       ├── b.gif
│       ├── c.gif
│       ├── captcha (10).gif
│       ├── captcha (12).gif
│       ├── captcha (4).gif
│       ├── captcha (6).gif
│       ├── captcha-3.gif
│       ├── d.gif
│       └── e.gif
├── main.py
├── requirements.txt
├── scrapy.cfg
└── scrapy_redis
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   ├── connection.cpython-36.pyc
    │   ├── defaults.cpython-36.pyc
    │   ├── dupefilter.cpython-36.pyc
    │   ├── picklecompat.cpython-36.pyc
    │   ├── pipelines.cpython-36.pyc
    │   ├── queue.cpython-36.pyc
    │   ├── scheduler.cpython-36.pyc
    │   ├── spiders.cpython-36.pyc
    │   └── utils.cpython-36.pyc
    ├── connection.py
    ├── defaults.py
    ├── dupefilter.py
    ├── picklecompat.py
    ├── pipelines.py
    ├── queue.py
    ├── scheduler.py
    ├── spiders.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
## Scrapy-Redis-Zhihu Project Overview

1. A distributed crawler built on scrapy-redis that crawls Zhihu questions and their corresponding answers.
2. Selenium is integrated for simulated login to Zhihu, including recognition of both the English captcha and the inverted-Chinese-character captcha.
3. MySQL inserts are executed asynchronously through Twisted.
4. A Bloom filter is integrated to deduplicate URLs.
5. Random User-Agent and IP-proxy rotation are used to counter anti-crawling measures.
6. Scrapy's signal mechanism is used to count the total number of URLs crawled (see the sketch below).
7. Scrapy's stats-collection mechanism is used to capture URLs that failed to crawl and write them to a JSON file for later analysis.
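Items 6 and 7 rely on Scrapy's signal and stats-collection APIs; the real logic lives in `Zhihu/spiders/zhihu.py`. A minimal sketch of the same idea as a standalone extension (class name, failure status codes and output path here are illustrative assumptions, not the project's exact code):

```
import json

from scrapy import signals


class CrawlStatsExtension(object):
    '''Count downloaded responses and dump failed URLs when the spider closes.'''

    def __init__(self, crawler):
        self.crawler = crawler
        self.crawl_url_count = 0
        self.failed_urls = []
        # fired once for every response the engine receives from the downloader
        crawler.signals.connect(self.on_response, signal=signals.response_received)
        crawler.signals.connect(self.on_close, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def on_response(self, response, request, spider):
        self.crawl_url_count += 1
        if response.status in (302, 403, 404, 500):
            self.failed_urls.append(response.url)
            self.crawler.stats.inc_value('failed_url')

    def on_close(self, spider):
        self.crawler.stats.set_value('crawl_url_count', self.crawl_url_count)
        with open('failed_urls.json', 'w') as f:
            json.dump({'failed_urls': self.failed_urls}, f)
```

Registering such a class under EXTENSIONS in settings.py would activate it; in this project the spider performs the equivalent bookkeeping itself via `dispatcher.connect`.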
## Scrapy-Redis-Zhihu Project Structure

captcha: captcha images (English captcha or inverted-Chinese-character captcha) saved from the Zhihu login page

cookies: cookies obtained after logging in

failed_urls: information about URLs that failed to crawl

libs: helper functions used throughout the Scrapy project

libs.bloomfilter: Bloom filter for URL deduplication

libs.chaojiying: English captcha recognition (Chaojiying client)

libs.common: miscellaneous helpers

libs.proxy: fetches free IP proxies from Xici (xicidaili.com)

spiders: the project's spider files

zheye: files for recognizing the inverted-Chinese-character captcha

## Scrapy-Redis-Zhihu Key Methods

spiders.zhihu.py:
get_cookies: logs in to Zhihu with Selenium, writes the resulting cookies to a file, and returns them

deal_with_chinese_captcha: recognition of the inverted-Chinese-character captcha

deal_with_english_captcha: recognition of the English captcha

middlewares.RedirectDealDownloaderMiddleware.process_response: because start_requests is overridden in scrapy-redis, the post-login cookies cannot be attached to the initial Response. This method therefore detects the login page, performs the simulated login, injects the post-login cookies into the Response, and also handles 302 redirects back to the login page.

## How to Use
### Install dependencies
```
git clone https://github.com/Yanxueshan/Scrapy-Redis-Zhihu.git
cd Scrapy-Redis-Zhihu
pip install -r requirements.txt
```

### Modify settings
A few parameters in settings.py need to be changed.

MySQL configuration -- replace with your own:
```
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'zhihu'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
```

Zhihu account and password used for the simulated login -- replace with your own:
```
ZHIHU_ACCOUNT = 'username'
ZHIHU_PASSWORD = 'password'
```

Chaojiying account used for English captcha recognition -- replace with your own (another third-party service can be used instead, but the corresponding code in zhihu.py must then be changed):
```
CHAOJIYING_ACCOUNT = 'username'
CHAOJIYING_PASSWORD = 'password'
CAPTCHA_TYPE = '898966'
```

### Preparation before running
Switch to the Redis installation directory and start redis-server:
```
cd redis
redis-server.exe redis.windows.conf
```
In another window, start redis-cli:
```
cd redis
redis-cli
```

### Run
```
redis-cli lpush zhihu:start_urls http://www.zhihu.com/signin
cd Scrapy-Redis-Zhihu
python main.py
```

--------------------------------------------------------------------------------
/Zhihu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/__init__.py
--------------------------------------------------------------------------------
/Zhihu/captcha/chinese_captcha.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/captcha/chinese_captcha.jpeg
--------------------------------------------------------------------------------
/Zhihu/captcha/english_captcha.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/captcha/english_captcha.jpeg
--------------------------------------------------------------------------------
/Zhihu/cookies/zhihu.cookies:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/cookies/zhihu.cookies
--------------------------------------------------------------------------------
/Zhihu/failed_urls/failed_urls.json:
--------------------------------------------------------------------------------
{"failed_urls": []}
--------------------------------------------------------------------------------
/Zhihu/items.py:
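One note on the Run step above: the `redis-cli lpush` command seeds the `zhihu:start_urls` list that the spider's `redis_key` points at, and scrapy-redis pops start URLs from that list. The same seeding can be done with redis-py when redis-cli is not at hand (a small sketch; host, port and db are assumptions for a default local Redis):

```
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
# the spider (redis_key = 'zhihu:start_urls') will pop this URL and start crawling
r.lpush('zhihu:start_urls', 'http://www.zhihu.com/signin')
```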
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from datetime import datetime 4 | from settings import SQL_DATETIME_FORMAT 5 | 6 | 7 | class ZhihuQuestionItem(scrapy.Item): 8 | ''' 9 | zhihu's question item design 10 | ''' 11 | question_id = scrapy.Field() 12 | topics = scrapy.Field() 13 | question_url = scrapy.Field() 14 | title = scrapy.Field() 15 | content = scrapy.Field() 16 | create_time = scrapy.Field() 17 | update_time = scrapy.Field() 18 | answer_nums = scrapy.Field() 19 | comment_nums = scrapy.Field() 20 | watch_user_nums = scrapy.Field() 21 | click_nums = scrapy.Field() 22 | crawl_time = scrapy.Field() 23 | crawl_update_time = scrapy.Field() 24 | 25 | def get_insert_sql(self): 26 | ''' 27 | get insert_sql and parameters of question 28 | ''' 29 | insert_sql = "insert into question(question_id, topics, question_url, title, content, answer_nums, " \ 30 | "comment_nums, watch_user_nums, click_nums, crawl_time)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, " \ 31 | "%s, %s)ON DUPLICATE KEY UPDATE content=VALUES(content), answer_nums=VALUES(" \ 32 | "answer_nums),comment_nums=VALUES(comment_nums), watch_user_nums=VALUES" \ 33 | "(watch_user_nums),click_nums=VALUES(click_nums)" 34 | 35 | parameters = ( 36 | self['question_id'], self['topics'], self['question_url'], 37 | self['title'], self['content'], self['answer_nums'], 38 | self['comment_nums'], self['watch_user_nums'], 39 | self['click_nums'], self['crawl_time'] 40 | ) 41 | return insert_sql, parameters 42 | 43 | 44 | class ZhihuAnswerItem(scrapy.Item): 45 | ''' 46 | zhihu's answer item design 47 | ''' 48 | answer_id = scrapy.Field() 49 | question_id = scrapy.Field() 50 | answer_url = scrapy.Field() 51 | author_id = scrapy.Field() 52 | content = scrapy.Field() 53 | praise_nums = scrapy.Field() 54 | comment_nums = scrapy.Field() 55 | create_time = scrapy.Field() 56 | update_time = scrapy.Field() 57 | crawl_time = scrapy.Field() 58 | crawl_update_time = scrapy.Field() 59 | 60 | def get_insert_sql(self): 61 | ''' 62 | get insert_sql and parameters of answer 63 | ''' 64 | insert_sql = "insert into answer(answer_id, question_id, answer_url, author_id, content, praise_nums, " \ 65 | "comment_nums, create_time, update_time, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, " \ 66 | "%s, %s)ON DUPLICATE KEY UPDATE content=VALUES(content), praise_nums=VALUES(" \ 67 | "praise_nums), comment_nums=VALUES(comment_nums), update_time=VALUES(update_time)" 68 | 69 | create_time = datetime.fromtimestamp(self['create_time']).strftime(SQL_DATETIME_FORMAT) 70 | update_time = datetime.fromtimestamp(self['update_time']).strftime(SQL_DATETIME_FORMAT) 71 | 72 | parameters = ( 73 | self['answer_id'], self['question_id'], self['answer_url'], 74 | self['author_id'], self['content'], self['praise_nums'], 75 | self['comment_nums'], create_time, update_time, self['crawl_time'] 76 | ) 77 | return insert_sql, parameters 78 | -------------------------------------------------------------------------------- /Zhihu/libs/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Yan' 2 | __date__ = '2019/3/25 20:56' -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/bloomfilter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/bloomfilter.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/chaojiying.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/chaojiying.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/crawl_ip_proxy.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/crawl_ip_proxy.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/__pycache__/proxy.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/libs/__pycache__/proxy.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/libs/bloomfilter.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import BitVector 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class BloomFilter(): 9 | # 内置100个随机种子 10 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | # capacity是预先估计要去重的数量 17 | # error_rate表示错误率 18 | # conn表示redis的连接客户端 19 | # key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=10000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) # 需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) # 需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) # 需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) # 需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | if not self.redis: 30 | # 默认如果没有redis连接,在内存中使用512M的内存块去重 31 | self.bitset = BitVector.BitVector(size=1<<32) 
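        # Sizing check for the defaults (capacity=10000, error_rate=1e-8), added for illustration:
        # m = n*ln(1/p)/(ln 2)^2 works out to roughly 3.8e5 bits (~47 KB of bit space).
        # The textbook optimum is k = (m/n)*ln 2, about 27 hash rounds here; note that
        # math.log1p(2) equals ln(3), so the k computed above comes out somewhat larger (43).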
32 | # print(self.mem) 33 | # print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | if self.redis: 40 | self.redis.setbit(name, hash, 1) 41 | else: 42 | self.bitset[hash] = 1 43 | 44 | def is_exist(self, value): 45 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 46 | hashs = self.get_hashs(value) 47 | exist = True 48 | for hash in hashs: 49 | if self.redis: 50 | exist = exist & self.redis.getbit(name, hash) 51 | else: 52 | exist = exist & self.bitset[hash] 53 | return exist 54 | 55 | def get_hashs(self, value): 56 | hashs = list() 57 | for seed in self.seeds: 58 | hash = mmh3.hash(value, seed) 59 | if hash >= 0: 60 | hashs.append(hash) 61 | else: 62 | hashs.append(self.N - hash) 63 | return hashs 64 | 65 | 66 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 67 | conn = redis.StrictRedis(connection_pool=pool) 68 | 69 | if __name__ == "__main__": 70 | start = time.time() 71 | bf = BloomFilter(conn=conn) 72 | bf.add('test') 73 | bf.add('fsest1') 74 | print(bf.is_exist('qest')) 75 | print(bf.is_exist('testdsad')) 76 | end = time.time() 77 | print(end-start) 78 | -------------------------------------------------------------------------------- /Zhihu/libs/chaojiying.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | import requests 5 | from hashlib import md5 6 | 7 | 8 | class Chaojiying_Client(object): 9 | def __init__(self, username, password, soft_id): 10 | self.username = username 11 | password = password.encode('utf8') 12 | self.password = md5(password).hexdigest() 13 | self.soft_id = soft_id 14 | self.base_params = { 15 | 'user': self.username, 16 | 'pass2': self.password, 17 | 'softid': self.soft_id, 18 | } 19 | self.headers = { 20 | 'Connection': 'Keep-Alive', 21 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 22 | } 23 | 24 | def PostPic(self, im, codetype): 25 | """ 26 | im: 图片字节 27 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html 28 | """ 29 | params = { 30 | 'codetype': codetype, 31 | } 32 | params.update(self.base_params) 33 | files = {'userfile': ('ccc.jpg', im)} 34 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers) 35 | return r.json() 36 | 37 | def ReportError(self, im_id): 38 | """ 39 | im_id:报错题目的图片ID 40 | """ 41 | params = { 42 | 'id': im_id, 43 | } 44 | params.update(self.base_params) 45 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers) 46 | return r.json() 47 | 48 | 49 | if __name__ == '__main__': 50 | chaojiying = Chaojiying_Client('Yanxueshan', 'lingtian..1021', '898966') 51 | im = open('verify.jpg', 'rb').read() 52 | print(chaojiying.PostPic(im, 1005)['pic_str']) 53 | -------------------------------------------------------------------------------- /Zhihu/libs/common.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 本模块用于编写一些用于Scrapy中的一些可用函数 3 | ''' 4 | import hashlib 5 | import re 6 | from zheye import zheye 7 | 8 | __author__ = 'Yan' 9 | __date__ = '2019/3/25 20:56' 10 | 11 | 12 | def get_md5(url): 13 | ''' 14 | 将url进行md5哈希,返回固定长度的字符串 15 | ''' 16 | if isinstance(url, str): 17 | url = url.encode('utf-8') 18 | return hashlib.md5(url).hexdigest() 19 | 20 | 21 | def get_position(captcha): 22 | ''' 23 | 识别知乎倒立文字验证码,返回倒立文字所在坐标 24 | 
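    Returns a list of [x, y] pixel coordinates, one or two entries ordered left to right.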
''' 25 | z = zheye() 26 | positions = z.Recognize(captcha) 27 | result = [] 28 | if len(positions) == 2: 29 | # two inverted characters 30 | if positions[0][1] > positions[1][1]: 31 | result.append([positions[1][1], positions[1][0]]) 32 | result.append([positions[0][1], positions[0][0]]) 33 | else: 34 | result.append([positions[0][1], positions[0][0]]) 35 | result.append([positions[1][1], positions[1][0]]) 36 | else: 37 | # one inverted characters 38 | result.append([positions[0][1], positions[0][0]]) 39 | return result 40 | 41 | 42 | def extract_nums(text): 43 | ''' 44 | 从text中提取出数字 45 | ''' 46 | text = text.replace(',', '') 47 | re_match = re.match('.*?(\d+).*', text) 48 | nums = 0 49 | if re_match: 50 | nums = re_match.group(1) 51 | return int(nums) 52 | 53 | 54 | if __name__ == "__main__": 55 | result = get_position('../zhihu_image/a.gif') 56 | print(result) 57 | -------------------------------------------------------------------------------- /Zhihu/libs/proxy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from scrapy.selector import Selector 3 | import redis 4 | import time 5 | 6 | __author__ = 'Yan' 7 | __date__ = '2019/4/1 7:50' 8 | 9 | 10 | class Fetch_Proxy(object): 11 | ''' 12 | 从西刺网站获取免费ip代理 13 | ''' 14 | def __init__(self): 15 | self.redis = redis.Redis(host='127.0.0.1', port=6379, db=0) 16 | self.redis_key = "proxy" 17 | 18 | def get_ip_list(self, pages): 19 | ''' 20 | 获取ip_list列表 21 | ''' 22 | headers = { 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' 24 | } 25 | for page in range(1, pages): 26 | url = 'https://www.xicidaili.com/wt/' + str(page) 27 | res = requests.get(url, headers=headers) 28 | selector = Selector(text=res.text) 29 | results = selector.css('#ip_list tr') 30 | for result in results[1:]: 31 | ip = result.css('td::text')[0].extract() 32 | port = result.css('td::text')[1].extract() 33 | proxy = ip + ':' + port 34 | self.redis.sadd(self.redis_key, proxy) 35 | 36 | def judge(self, proxy): 37 | ''' 38 | 判断ip是否可以用 39 | ''' 40 | proxy_dict = {'http': "http://" + proxy} 41 | try: 42 | res = requests.get('https://www.baidu.com', proxies=proxy_dict) 43 | except Exception: 44 | print('该proxy:' + proxy + '无效') 45 | return False 46 | else: 47 | if res.status_code == 200: 48 | return True 49 | else: 50 | print('该proxy:' + proxy + '无效') 51 | self.redis.srem(self.redis_key, proxy) 52 | return False 53 | 54 | def insert_ip(self, proxy): 55 | ''' 56 | 往redis中添加数据 57 | ''' 58 | self.redis.sadd(self.redis_key, proxy) 59 | 60 | def delete_ip(self, proxy): 61 | ''' 62 | 从redis中删除无效ip 63 | ''' 64 | self.redis.srem(self.redis_key, proxy) 65 | 66 | def get_random_ip(self): 67 | ''' 68 | 从redis中随机获取一个proxy 69 | ''' 70 | if self.redis.scard(self.redis_key) < 50: 71 | self.get_ip_list(5) 72 | proxy = self.redis.srandmember(self.redis_key, 1)[0].decode('utf8') 73 | result = self.judge(proxy) 74 | if result: 75 | return "http://" + proxy 76 | else: 77 | self.get_random_ip() 78 | 79 | 80 | if __name__ == "__main__": 81 | start_time = time.time() 82 | fetch = Fetch_Proxy() 83 | print(fetch.get_random_ip()) 84 | print("time cost: ", time.time()-start_time) 85 | -------------------------------------------------------------------------------- /Zhihu/libs/zheye_test.py: -------------------------------------------------------------------------------- 1 | from zheye import zheye 2 | 3 | z = zheye() 4 | positions = z.Recognize('../zhihu_image/a.gif') 5 | 
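# Recognize returns (row, col) centres for the inverted characters;
# libs/common.py's get_position swaps these into [x, y] pairs for clicking.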
print(positions) 6 | -------------------------------------------------------------------------------- /Zhihu/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from fake_useragent import UserAgent 3 | from settings import USER_AGENT_LIST, BASE_DIR 4 | import random 5 | from scrapy.http import HtmlResponse 6 | import os 7 | import pickle 8 | from libs.proxy import Fetch_Proxy 9 | 10 | 11 | class ZhihuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ZhihuDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | 105 | 106 | class RandomUserAgentDownloaderMiddleware(object): 107 | ''' 108 | randomly generated user-agent 109 | ''' 110 | def __init__(self, crawler): 111 | # self.user_agent = UserAgent() 112 | self.user_agent_list = crawler.settings.get('USER_AGENT_LIST', []) 113 | super().__init__() 114 | 115 | @classmethod 116 | def from_crawler(cls, crawler): 117 | ''' 118 | get crawler 119 | ''' 120 | return cls(crawler) 121 | 122 | def process_request(self, request, spider): 123 | ''' 124 | process request --> add random user-agent to request's headers 125 | ''' 126 | user_agent = random.choice(self.user_agent_list) 127 | print("Using User-Agent: ", user_agent) 128 | request.headers.setdefault("User-Agent", user_agent) 129 | 130 | 131 | class ProxyDownloaderMiddleware(object): 132 | ''' 133 | 设置IP代理 134 | ''' 135 | def __init__(self, crawler): 136 | self.fetch = Fetch_Proxy() 137 | super().__init__() 138 | 139 | @classmethod 140 | def from_crawler(cls, crawler): 141 | ''' 142 | 自己写Middleware必须实现的函数,manager会自主调用 143 | ''' 144 | return cls(crawler) 145 | 146 | def process_request(self, request, spider): 147 | ''' 148 | process request --> add random user-agent to request's headers 149 | ''' 150 | proxy = self.fetch.get_random_ip() 151 | print("Using proxy: ", proxy) 152 | request.meta["proxy"] = "http://" + proxy 153 | 154 | 155 | class RedirectDealDownloaderMiddleware(object): 156 | ''' 157 | 处理知乎302重定向问题以及最初cookies传递问题 158 | ''' 159 | def process_response(self, request, response, spider): 160 | ''' 161 | deal with 302 162 | ''' 163 | if response.status == 302 and 'signup' in response.url: 164 | cookies = spider.get_cookies() 165 | cookies_dict = {} 166 | for cookie in cookies: 167 | cookies_dict[cookie["name"]] = cookie["value"] 168 | 169 | headers = { 170 | 'set-cookie': cookies_dict 171 | } 172 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, 173 | encoding='utf8', request=request, headers=headers) 174 | if 'signin' in response.url: 175 | cookies = [] 176 | if os.path.exists(BASE_DIR+'/Zhihu/cookies/zhihu.cookies'): 177 | cookies = pickle.load(open(BASE_DIR+'/Zhihu/cookies/zhihu.cookies', 'rb')) 178 | 179 | if not cookies: 180 | cookies = spider.get_cookies() 181 | 182 | cookies_dict = {} 183 | for cookie in cookies: 184 | cookies_dict[cookie["name"]] = cookie["value"] 185 | 186 | headers = { 187 | 'set-cookie': cookies_dict 188 | } 189 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, 190 | encoding='utf8', request=request, headers=headers) 191 | return response 192 | -------------------------------------------------------------------------------- /Zhihu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pymysql.cursors import DictCursor 3 | from twisted.enterprise import adbapi 4 | 5 | 6 | class MySQLTwistedPipeline(object): 7 | ''' 8 | 将MySQL插入操作变成异步化 9 | ''' 10 | def __init__(self, db_pool): 11 | self.db_pool = db_pool 12 | 13 | @classmethod 14 | def from_settings(cls, settings): 15 | ''' 16 | create db_pool 17 | ''' 18 | db_parameters = dict( 19 | host=settings["MYSQL_HOST"], 20 | 
db=settings["MYSQL_DBNAME"], 21 | user=settings["MYSQL_USER"], 22 | passwd=settings["MYSQL_PASSWORD"], 23 | charset="utf8", 24 | cursorclass=DictCursor, 25 | use_unicode=True 26 | ) 27 | db_pool = adbapi.ConnectionPool("pymysql", **db_parameters) 28 | return cls(db_pool) 29 | 30 | def process_item(self, item, spider): 31 | ''' 32 | process item 33 | ''' 34 | query = self.db_pool.runInteraction(self.do_insert, item) 35 | query.addErrback(self.handle_error, item, spider) 36 | 37 | def handle_error(self, failure, item, spider): 38 | ''' 39 | handle error of insert to mysql 40 | ''' 41 | print(failure) 42 | 43 | def do_insert(self, cursor, item): 44 | ''' 45 | insert data into the database 46 | ''' 47 | insert_sql, parameters = item.get_insert_sql() 48 | cursor.execute(insert_sql, parameters) 49 | -------------------------------------------------------------------------------- /Zhihu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | 5 | BOT_NAME = 'Zhihu' 6 | 7 | SPIDER_MODULES = ['Zhihu.spiders'] 8 | NEWSPIDER_MODULE = 'Zhihu.spiders' 9 | 10 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 11 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 12 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 13 | # USER_AGENT = 'Zhihu (+http://www.yourdomain.com)' 14 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' 15 | 16 | # Obey robots.txt rules 17 | ROBOTSTXT_OBEY = False 18 | 19 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 20 | # CONCURRENT_REQUESTS = 32 21 | 22 | # Configure a delay for requests for the same website (default: 0) 23 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 24 | # See also autothrottle settings and docs 25 | DOWNLOAD_DELAY = 1 26 | # The download delay setting will honor only one of: 27 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 28 | # CONCURRENT_REQUESTS_PER_IP = 16 29 | 30 | # Disable cookies (enabled by default) 31 | COOKIES_ENABLED = True 32 | 33 | # Disable Telnet Console (enabled by default) 34 | # TELNETCONSOLE_ENABLED = False 35 | 36 | # Override the default request headers: 37 | # DEFAULT_REQUEST_HEADERS = { 38 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 39 | # 'Accept-Language': 'en', 40 | # } 41 | 42 | # Enable or disable spider middlewares 43 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 44 | # SPIDER_MIDDLEWARES = { 45 | # 'Zhihu.middlewares.ZhihuSpiderMiddleware': 543, 46 | # } 47 | 48 | # Enable or disable downloader middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 50 | DOWNLOADER_MIDDLEWARES = { 51 | 'Zhihu.middlewares.ZhihuDownloaderMiddleware': 543, 52 | 'Zhihu.middlewares.RedirectDealDownloaderMiddleware': 3, 53 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 54 | 'Zhihu.middlewares.RandomUserAgentDownloaderMiddleware': 1, 55 | 'Zhihu.middlewares.ProxyDownloaderMiddleware': 2, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | # 'Zhihu.pipelines.ZhihuPipeline': 300, 68 | 
'scrapy_redis.pipelines.RedisPipeline': 300, 69 | 'Zhihu.items.ZhihuAnswerItem': 1, 70 | 'Zhihu.items.ZhihuQuestionItem': 2, 71 | 'Zhihu.pipelines.MySQLTwistedPipeline': 3, 72 | } 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | # AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | # AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | # AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | # AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | # HTTPCACHE_ENABLED = True 90 | # HTTPCACHE_EXPIRATION_SECS = 0 91 | # HTTPCACHE_DIR = 'httpcache' 92 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | MYSQL_HOST = 'localhost' 95 | MYSQL_DBNAME = 'zhihu' 96 | MYSQL_USER = 'root' 97 | MYSQL_PASSWORD = 'root' 98 | 99 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 100 | sys.path.insert(0, os.path.join(BASE_DIR, 'Zhihu')) 101 | # print(BASE_DIR) 102 | 103 | ZHIHU_ACCOUNT = 'username' 104 | ZHIHU_PASSWORD = 'password' 105 | 106 | CHAOJIYING_ACCOUNT = 'username' 107 | CHAOJIYING_PASSWORD = 'password' 108 | CAPTCHA_TYPE = '898966' 109 | 110 | SQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' 111 | SQL_DATE_FORMAT = '%Y-%m-%d' 112 | 113 | USER_AGENT_LIST = [ 114 | # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", 115 | "Opera/8.0 (Windows NT 5.1; U; en)", 116 | "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50", 117 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50", 118 | # Firefox 119 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 120 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 121 | # Safari 122 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 123 | # chrome 124 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", 125 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 126 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16", 127 | # 360 128 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 129 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", 130 | # 淘宝浏览器 131 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 132 | # 猎豹浏览器 133 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 134 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 135 | 
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 136 | # QQ浏览器 137 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 138 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 139 | # sogou浏览器 140 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", 141 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)", 142 | # maxthon浏览器 143 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", 144 | # UC浏览器 145 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36", 146 | ] 147 | -------------------------------------------------------------------------------- /Zhihu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Zhihu/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import os 5 | import json 6 | import datetime 7 | import time 8 | import pickle 9 | import base64 10 | import mouse 11 | from settings import BASE_DIR, ZHIHU_ACCOUNT, ZHIHU_PASSWORD, CHAOJIYING_ACCOUNT, CHAOJIYING_PASSWORD, CAPTCHA_TYPE 12 | from scrapy.xlib.pydispatch import dispatcher 13 | from scrapy import signals 14 | from scrapy.loader import ItemLoader 15 | from selenium import webdriver 16 | from selenium.webdriver.chrome.options import Options 17 | from selenium.webdriver.common.keys import Keys 18 | from items import ZhihuAnswerItem, ZhihuQuestionItem 19 | from urllib import parse 20 | from libs.common import get_md5, get_position, extract_nums 21 | from libs.chaojiying import Chaojiying_Client 22 | from scrapy_redis.spiders import RedisSpider 23 | from scrapy_redis.utils import bytes_to_str 24 | 25 | 26 | class ZhihuSpider(RedisSpider): 27 | ''' 28 | ZhihuSpier --> get question and answer from www.zhihu.com 29 | ''' 30 | name = 'zhihu' 31 | allowed_domains = ['www.zhihu.com'] 32 | redis_key = 'zhihu:start_urls' 33 | # start_urls = ['https://www.zhihu.com/'] 34 | start_answer_url = 'https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit={1}&offset={2}&platform=desktop&sort_by=default' 35 | # scrapy默认处理 >=200 并且 <300 
的URL,其他的会过滤掉,handle_httpstatus_list表示对返回这些状态码的URL不过滤,自己处理 36 | handle_httpstatus_list = [302, 400, 403, 404, 500] 37 | 38 | def __init__(self): 39 | # scrapy集成selenium 40 | # chromedriver中有一些js变量会暴露,被服务器识别出来,所以保险起见,可以手动启动chromedriver 41 | # 1. 找到chrome.exe文件所在路径,cmd中进入该路径,执行chrome.exe --remote-debugging-port=9222 42 | # 2. 执行下列语句(执行第一步后要保证127.0.0.1:9222/json能够正常访问,在这之前需要退出所有的chrome) 43 | chrome_opt = Options() 44 | chrome_opt.add_argument("--disable-extensions") 45 | chrome_opt.add_experimental_option("debuggerAddress", "127.0.0.1:9222") 46 | self.browser = webdriver.Chrome(executable_path="C:/Users/晏乐/Desktop/Lagou/chromedriver", 47 | chrome_options=chrome_opt) 48 | 49 | # crawl_url_count: 用来统计爬取URL的总数 50 | self.crawl_url_count = 0 51 | 52 | # 数据收集,收集Scrapy运行过程中302/403/404页面URL及URL数量 53 | # failed_url: 用来存放302/403/404页面URL 54 | self.failed_urls = [] 55 | 56 | # 信号处理,当爬虫退出时执行spider_closed方法 57 | dispatcher.connect(self.spider_closed, signals.spider_closed) 58 | 59 | # 信号处理,当引擎从downloader中获取到一个新的Response对象时调用get_crawl_url_count方法 60 | dispatcher.connect(self.get_crawl_url_count, signals.response_received) 61 | 62 | super().__init__() 63 | 64 | # def start_requests(self): 65 | # cookies = [] 66 | # if os.path.exists(BASE_DIR+'/Zhihu/cookies/zhihu.cookies'): 67 | # cookies = pickle.load(open(BASE_DIR+'/Zhihu/cookies/zhihu.cookies', 'rb')) 68 | 69 | # if not cookies: 70 | # cookies = self.get_cookies() 71 | 72 | # cookies_dict = {} 73 | # for cookie in cookies: 74 | # cookies_dict[cookie["name"]] = cookie["value"] 75 | 76 | # use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) 77 | # fetch_one = self.server.lpop 78 | # data = fetch_one(self.redis_key) 79 | # url = bytes_to_str(data, self.redis_encoding) 80 | # for url in self.start_urls: 81 | # yield scrapy.Request(url, dont_filter=True, cookies=cookies_dict) 82 | 83 | def parse(self, response): 84 | ''' 85 | parse response --> get question's url 86 | ''' 87 | # get all urls and filter 88 | all_urls = response.css("a::attr(href)").extract() 89 | all_urls = [parse.urljoin(response.url, url) for url in all_urls] 90 | all_urls = list(filter(lambda url: True if url.startswith("http") else False, all_urls)) 91 | 92 | for url in all_urls: 93 | re_match = re.match('(.*?zhihu.com/question/(\d+)).*', url) 94 | if re_match: 95 | request_url = re_match.group(1) 96 | question_id = re_match.group(2) 97 | yield scrapy.Request(url=request_url, meta={"question_id": question_id}, callback=self.parse_question) 98 | break 99 | else: 100 | yield scrapy.Request(url=url, callback=self.parse) 101 | 102 | def parse_question(self, response): 103 | ''' 104 | parse question 105 | ''' 106 | if response.status in self.handle_httpstatus_list: 107 | self.failed_urls.append(response.url) 108 | # 数据收集,当Response状态码为403/404/500时,failed_url数加1 109 | self.crawler.stats.inc_value("failed_url") 110 | 111 | question_item = ZhihuQuestionItem() 112 | question_id = int(response.meta.get("question_id")) 113 | title = response.css('.QuestionHeader-title::text').extract_first('') 114 | question_url = response.url 115 | topics = response.css('meta[itemprop="keywords"]::attr(content)').extract() 116 | topics = '/'.join(topics) 117 | content = response.css('.QuestionRichText--collapsed div span::text').extract_first('') 118 | answer_nums = response.css('.List-headerText span::text').extract_first('') 119 | answer_nums = extract_nums(answer_nums) 120 | comment_nums = response.css('.QuestionHeader-Comment button::text').extract_first('') 121 | comment_nums = 
extract_nums(comment_nums) 122 | watch_user_nums = response.css('.NumberBoard-itemValue::text').extract_first('') 123 | watch_user_nums = extract_nums(watch_user_nums) 124 | click_nums = response.css('.NumberBoard-itemValue::text').extract()[1] 125 | click_nums = extract_nums(click_nums) 126 | crawl_time = datetime.datetime.now() 127 | 128 | question_item["question_id"] = question_id 129 | question_item["topics"] = topics 130 | question_item["question_url"] = question_url 131 | question_item["title"] = title 132 | question_item["content"] = content 133 | question_item["answer_nums"] = answer_nums 134 | question_item["comment_nums"] = comment_nums 135 | question_item["watch_user_nums"] = watch_user_nums 136 | question_item["click_nums"] = click_nums 137 | question_item["crawl_time"] = crawl_time 138 | 139 | yield question_item 140 | yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0), callback=self.parse_answer) 141 | 142 | def parse_answer(self, response): 143 | ''' 144 | parse answer 145 | ''' 146 | if response.status in self.handle_httpstatus_list: 147 | self.failed_urls.append(response.url) 148 | # 数据收集,当Response状态码为403/404/500时,failed_url数加1 149 | self.crawler.stats.inc_value("failed_url") 150 | 151 | answer_dcit = json.loads(response.text) 152 | is_end = answer_dcit['paging']['is_end'] 153 | next_url = answer_dcit['paging']['next'] 154 | 155 | for answer in answer_dcit['data']: 156 | answer_item = ZhihuAnswerItem() 157 | answer_item["answer_id"] = answer['id'] 158 | answer_item["question_id"] = answer['question']['id'] 159 | answer_item["answer_url"] = answer['url'] 160 | answer_item["author_id"] = answer['author']['id'] if 'id' in answer['author'] else '' 161 | answer_item["content"] = answer['content'] 162 | answer_item["praise_nums"] = answer['voteup_count'] 163 | answer_item["comment_nums"] = answer['comment_count'] 164 | answer_item["create_time"] = answer['created_time'] 165 | answer_item["update_time"] = answer['updated_time'] 166 | answer_item["crawl_time"] = datetime.datetime.now() 167 | question_create_time = answer['question']['created'] 168 | question_update_time = answer['question']['updated_time'] 169 | yield answer_item 170 | 171 | if not is_end: 172 | yield scrapy.Request(next_url, callback=self.parse_answer) 173 | 174 | def spider_closed(self, spider): 175 | ''' 176 | 当爬虫退出时关闭chrome,收集爬取失败(302/403/404)的URL,并写入json文件中 177 | ''' 178 | self.browser.quit() 179 | self.crawler.stats.set_value("failed_urls", ','.join(self.failed_urls)) 180 | failed_url_dict = {'failed_urls': self.failed_urls} 181 | json_str = json.dumps(failed_url_dict) 182 | with open(BASE_DIR+"/Zhihu/failed_urls/failed_urls.json", 'w') as f: 183 | f.write(json_str) 184 | 185 | def get_crawl_url_count(self, spider): 186 | ''' 187 | 当引擎engine从downloader中获取到一个新的Response对象时调用,crawl_url_count+=1 188 | ''' 189 | self.crawl_url_count += 1 190 | print("截至目前已爬取URL总数为: ", self.crawl_url_count) 191 | return self.crawl_url_count 192 | 193 | def get_cookies(self): 194 | ''' 195 | get cookies from www.zhihu.com 196 | ''' 197 | # 1. maximize the browser window 198 | try: 199 | self.browser.maximize_window() 200 | except Exception: 201 | pass 202 | 203 | # 2. 
login simulation 204 | self.browser.get("https://www.zhihu.com/signin") 205 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a") 206 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(ZHIHU_ACCOUNT) 207 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a") 208 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(ZHIHU_PASSWORD) 209 | self.browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click() 210 | time.sleep(5) 211 | 212 | login_success = False 213 | while not login_success: 214 | # if login failed, login again --> captcha identification --> english captcha and chinese captcha 215 | 216 | # if login success, can find element with the class_name GlobalWrite-navTitle 217 | try: 218 | notify_ele = self.browser.find_element_by_class_name("GlobalWrite-navTitle") 219 | login_success = True 220 | break 221 | except Exception: 222 | pass 223 | 224 | # find english captcha or chinese captcha 225 | try: 226 | english_captcha_element = self.browser.find_element_by_class_name("Captcha-englishImg") 227 | except Exception: 228 | english_captcha_element = None 229 | try: 230 | chinese_captcha_element = self.browser.find_element_by_class_name("Captcha-chineseImg") 231 | except Exception: 232 | chinese_captcha_element = None 233 | 234 | # deal with chinese captcha 235 | if chinese_captcha_element: 236 | self.deal_with_chinese_captcha(chinese_captcha_element) 237 | 238 | # deal with english captcha 239 | if english_captcha_element: 240 | self.deal_with_english_captcha(english_captcha_element) 241 | 242 | if login_success: 243 | # if login success, get cookies and write cookies to a file 244 | cookies = self.browser.get_cookies() 245 | pickle.dump(cookies, open(BASE_DIR+"/Zhihu/cookies/zhihu.cookies", 'wb')) 246 | 247 | return cookies 248 | 249 | def deal_with_chinese_captcha(self, chinese_captcha_element): 250 | ''' 251 | deal with chinese captcha 252 | ''' 253 | # get chinese captcha image coordinate 254 | ele_position = chinese_captcha_element.location 255 | x_coordinate = ele_position['x'] 256 | y_coordinate = ele_position['y'] 257 | browser_navigation_panel_height = self.browser.execute_script( 258 | "return window.outerHeight - window.innerHeight;" 259 | ) 260 | 261 | # find chinese captcha image and write to a file 262 | base64_text = chinese_captcha_element.get_attribute("src") 263 | code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "") 264 | with open(BASE_DIR+'/Zhihu/captcha/chinese_captcha.jpeg', 'wb') as f: 265 | f.write(base64.b64decode(code)) 266 | 267 | # deal with chinese captcha 268 | positions = get_position(BASE_DIR+'/Zhihu/captcha/chinese_captcha.jpeg') 269 | if len(positions) == 2: 270 | first_position = [positions[0][0] // 2, positions[0][1] // 2] 271 | second_position = [positions[1][0] // 2, positions[1][1] // 2] 272 | 273 | # click first inverted character 274 | mouse.move( 275 | x_coordinate+first_position[0], 276 | y_coordinate+browser_navigation_panel_height+first_position[1] 277 | ) 278 | mouse.click() 279 | 280 | # click second inverted character 281 | time.sleep(2) 282 | mouse.move( 283 | x_coordinate+second_position[0], 284 | y_coordinate+browser_navigation_panel_height+second_position[1] 285 | ) 286 | mouse.click() 287 | else: 288 | first_position = [positions[0][0] // 2, positions[0][1] // 2] 289 | mouse.move( 290 | 
x_coordinate+first_position[0], 291 | y_coordinate+browser_navigation_panel_height+first_position[1] 292 | ) 293 | mouse.click() 294 | 295 | # input account and password again 296 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a") 297 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(ZHIHU_ACCOUNT) 298 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a") 299 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(ZHIHU_PASSWORD) 300 | self.browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click() 301 | time.sleep(5) 302 | 303 | def deal_with_english_captcha(self, english_captcha_element): 304 | ''' 305 | deal with english captcha 306 | ''' 307 | # find english captcha image and write to a file 308 | base64_text = english_captcha_element.get_attribute("src") 309 | code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "") 310 | with open(BASE_DIR+'/Zhihu/captcha/english_captcha.jpeg', 'wb') as f: 311 | f.write(base64.b64decode(code)) 312 | 313 | # deal with english captcha 314 | chaojiying = Chaojiying_Client(CHAOJIYING_ACCOUNT, CHAOJIYING_PASSWORD, CAPTCHA_TYPE) 315 | with open(BASE_DIR+'/Zhihu/captcha/english_captcha.jpeg', 'rb') as f: 316 | im = f.read() 317 | result = chaojiying.PostPic(im, 1005)['pic_str'] 318 | self.browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(Keys.CONTROL + 'a') 319 | self.browser.find_element_by_css_selector('.Input-wrapper input[name="captcha"]').send_keys(result) 320 | 321 | # input account and password again 322 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a") 323 | self.browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(ZHIHU_ACCOUNT) 324 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a") 325 | self.browser.find_element_by_css_selector(".SignFlow-password input").send_keys(ZHIHU_PASSWORD) 326 | self.browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click() 327 | time.sleep(5) 328 | -------------------------------------------------------------------------------- /Zhihu/zheye/Kaiti-SC-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/Kaiti-SC-Bold.ttf -------------------------------------------------------------------------------- /Zhihu/zheye/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | # Recognizing class 4 | 5 | from sklearn.mixture import GaussianMixture 6 | from PIL import Image 7 | from zheye import util 8 | import numpy as np 9 | 10 | class zheye: 11 | def __init__(self): 12 | ''' load model ''' 13 | import os 14 | import keras 15 | full_path = os.path.realpath(__file__) 16 | path, filename = os.path.split(full_path) 17 | self.model = keras.models.load_model(path +'/zheyeV3.keras') 18 | 19 | def Recognize(self, fn): 20 | im = Image.open(fn) 21 | im = util.CenterExtend(im, radius=20) 22 | 23 | vec = np.asarray(im.convert('L')).copy() 24 | Y = [] 25 | for i in range(vec.shape[0]): 26 | for j in range(vec.shape[1]): 27 | if vec[i][j] <= 200: 28 | 
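                    # collect (row, col) coordinates of dark pixels; the 7-component
                    # GaussianMixture below clusters them into candidate character centres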
Y.append([i, j]) 29 | 30 | gmm = GaussianMixture(n_components=7, covariance_type='tied', reg_covar=1e2, tol=1e3, n_init=9) 31 | gmm.fit(Y) 32 | 33 | centers = gmm.means_ 34 | 35 | points = [] 36 | for i in range(7): 37 | scoring = 0.0 38 | for w_i in range(3): 39 | for w_j in range(3): 40 | p_x = centers[i][0] -1 +w_i 41 | p_y = centers[i][1] -1 +w_j 42 | 43 | cr = util.crop(im, p_x, p_y, radius=20) 44 | cr = cr.resize((40, 40), Image.ANTIALIAS) 45 | 46 | X = np.asarray(cr.convert('L'), dtype='float') 47 | X = (X.astype("float") - 180) /200 48 | 49 | x0 = np.expand_dims(X, axis=0) 50 | x1 = np.expand_dims(x0, axis=3) 51 | 52 | global model 53 | if self.model.predict(x1)[0][0] < 0.5: 54 | scoring += 1 55 | 56 | if scoring > 4: 57 | points.append((centers[i][0] -20, centers[i][1] -20)) 58 | 59 | return points -------------------------------------------------------------------------------- /Zhihu/zheye/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/zheye/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /Zhihu/zheye/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | from PIL import Image, ImageFont, ImageDraw 4 | import numpy as np 5 | 6 | from random import randint, choice 7 | from math import sin, cos, radians, fabs 8 | 9 | import os 10 | dir_path = os.path.dirname(os.path.realpath(__file__)) 11 | 12 | def crop(im, y, x, radius = 20): 13 | return im.crop((x-radius, y-radius, x+radius, y+radius)) 14 | 15 | def PaintPoint(image, points=[]): 16 | im = image.copy() 17 | bgdr = ImageDraw.Draw(im) 18 | for y, x in points: 19 | bgdr.ellipse((x-3, y-3, x+3, y+3), fill ="red", outline ='red') 20 | return im 21 | 22 | def RandomGenerateOneChar(y=None, character=None, radius=20): 23 | ''' 24 | y == 1 汉字正 25 | y ==-1 汉字倒 26 | radius < 50 27 | ''' 28 | choices = range(-30, 30) + range(-180, -150) + range(150, 180) 29 | 30 | angle = choice(choices) 31 | if y != None: 32 | while (angle <= 30 and angle >= -30) == (y == -1): 33 | angle = choice(choices) 34 | else: 35 | y = -1 36 | if angle <= 30 and angle >= -30: 37 | y = 1 38 | 39 | rad = radians(angle) 40 | if character == None: 41 | character = RandomGB2312() 42 | 43 | background = Image.new("RGBA", (160, 160), (255,255,255,255)) 44 | 45 | im = Image.new("RGBA", (72, 82), (0, 0, 0, 0)) 46 | global dir_path 47 | font = ImageFont.truetype(dir_path + "/Kaiti-SC-Bold.ttf", 72) 48 | 49 | dr = ImageDraw.Draw(im) 50 | dr.fontmode = "1" 51 | dr.text((0, 0), character, font=font, fill="#000000") 52 | 53 | fore = im.rotate(angle, expand=1) 54 | width, height = fore.size 55 | 56 | scale = np.random.uniform(0.8, 1.2) 57 | fore = fore.resize((int(width *scale), int(height*scale)), Image.ANTIALIAS) 58 | width, height = fore.size 59 | 60 | background.paste(fore, (80 - width/2 + randint(-10, 10), 80 -10*y - height/2 + randint(-10, 10)), fore) 61 | return 
background.crop((80-radius, 80-radius, 80+radius, 80+radius)) 62 | 63 | def RandomGB2312(): 64 | ''' 65 | 来自 66 | http://blog.3gcnbeta.com/2010/02/08/ 67 | python-%E9%9A%8F%E6%9C%BA%E7%94%9F%E6%88%90%E4%B8%AD%E6%96%87%E7%9A%84%E4%BB%A3%E7%A0%81/ 68 | 69 | 有bug 70 | ''' 71 | head = randint(0xB0, 0xDF) 72 | body = randint(0xA, 0xF) 73 | tail = randint(0, 0xF) 74 | val = ( head << 0x8 ) | (body << 0x4 ) | tail 75 | str = '%x' % val 76 | try: 77 | return str.decode('hex').decode('gb2312') 78 | except: 79 | return RandomGB2312() 80 | 81 | def Img2Vec(im): 82 | return np.asarray(im.convert('L')) 83 | 84 | def Vec2Ascii(x): 85 | import sys 86 | for i in x: 87 | for j in i: 88 | #if j > 0: 89 | if j > 200: 90 | sys.stdout.write('+') 91 | else: 92 | sys.stdout.write(' ') 93 | print 94 | 95 | def CenterExtend(im, width=400, height=88, radius=20): 96 | x1 = np.full((height+radius+radius, width+radius+radius), 255, dtype='uint8') 97 | x2 = np.asarray(im.convert('L')) 98 | x1[radius:radius+height,radius:radius+width] = x2 99 | return Image.fromarray(x1, 'L') 100 | 101 | if __name__ == '__main__': 102 | pass 103 | -------------------------------------------------------------------------------- /Zhihu/zheye/zheyeV3.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/zheyeV3.keras -------------------------------------------------------------------------------- /Zhihu/zheye/zheyeV4.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/zheyeV4.keras -------------------------------------------------------------------------------- /Zhihu/zheye/zheyeV5.keras: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zheye/zheyeV5.keras -------------------------------------------------------------------------------- /Zhihu/zhihu_image/a.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/a.gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/b.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/b.gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/c.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/c.gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/captcha (10).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/captcha (10).gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/captcha (12).gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/captcha (12).gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/captcha (4).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/captcha (4).gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/captcha (6).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/captcha (6).gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/captcha-3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/captcha-3.gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/d.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/d.gif -------------------------------------------------------------------------------- /Zhihu/zhihu_image/e.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/Zhihu/zhihu_image/e.gif -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from scrapy.cmdline import execute 4 | 5 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 6 | execute(['scrapy', 'crawl', 'zhihu']) 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BitVector==3.4.8 2 | fake-useragent==0.1.11 3 | h5py==2.9.0 4 | Keras==2.0.1 5 | mmh3==2.5.1 6 | mouse==0.7.0 7 | numpy==1.16.2+mkl 8 | Pillow==5.4.1 9 | PyMySQL==0.8.0 10 | redis==2.10.6 11 | requests==2.18.4 12 | scikit-learn==0.20.3 13 | scipy==1.2.1 14 | Scrapy==1.6.0 15 | selenium==3.141.0 16 | tensorflow==1.13.1 -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Zhihu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Zhihu 12 | -------------------------------------------------------------------------------- /scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .connection import ( # NOQA 3 | get_redis, 4 | get_redis_from_settings, 5 | ) 6 | 7 | 8 | __author__ = 'Rolando Espinoza' 9 | 
__email__ = 'rolando at rmax.io' 10 | __version__ = '0.7.0-dev' 11 | -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/connection.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/connection.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/defaults.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/defaults.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/dupefilter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/dupefilter.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/picklecompat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/picklecompat.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/queue.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/queue.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/scheduler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/scheduler.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/spiders.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/spiders.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/__pycache__/utils.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yanxueshan/Scrapy-Redis-Zhihu/9f1cc208a0dda127101397a9277af3696a78b1b1/scrapy_redis/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | from . import defaults 6 | 7 | 8 | # Shortcut maps 'setting name' -> 'parmater name'. 9 | SETTINGS_PARAMS_MAP = { 10 | 'REDIS_URL': 'url', 11 | 'REDIS_HOST': 'host', 12 | 'REDIS_PORT': 'port', 13 | 'REDIS_ENCODING': 'encoding', 14 | } 15 | 16 | 17 | def get_redis_from_settings(settings): 18 | """Returns a redis client instance from given Scrapy settings object. 19 | 20 | This function uses ``get_client`` to instantiate the client and uses 21 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 22 | can override them using the ``REDIS_PARAMS`` setting. 23 | 24 | Parameters 25 | ---------- 26 | settings : Settings 27 | A scrapy settings object. See the supported settings below. 28 | 29 | Returns 30 | ------- 31 | server 32 | Redis client instance. 33 | 34 | Other Parameters 35 | ---------------- 36 | REDIS_URL : str, optional 37 | Server connection URL. 38 | REDIS_HOST : str, optional 39 | Server host. 40 | REDIS_PORT : str, optional 41 | Server port. 42 | REDIS_ENCODING : str, optional 43 | Data encoding. 44 | REDIS_PARAMS : dict, optional 45 | Additional client parameters. 46 | 47 | """ 48 | params = defaults.REDIS_PARAMS.copy() 49 | params.update(settings.getdict('REDIS_PARAMS')) 50 | # XXX: Deprecate REDIS_* settings. 51 | for source, dest in SETTINGS_PARAMS_MAP.items(): 52 | val = settings.get(source) 53 | if val: 54 | params[dest] = val 55 | 56 | # Allow ``redis_cls`` to be a path to a class. 57 | if isinstance(params.get('redis_cls'), six.string_types): 58 | params['redis_cls'] = load_object(params['redis_cls']) 59 | 60 | return get_redis(**params) 61 | 62 | 63 | # Backwards compatible alias. 64 | from_settings = get_redis_from_settings 65 | 66 | 67 | def get_redis(**kwargs): 68 | """Returns a redis client instance. 69 | 70 | Parameters 71 | ---------- 72 | redis_cls : class, optional 73 | Defaults to ``redis.StrictRedis``. 74 | url : str, optional 75 | If given, ``redis_cls.from_url`` is used to instantiate the class. 76 | **kwargs 77 | Extra parameters to be passed to the ``redis_cls`` class. 78 | 79 | Returns 80 | ------- 81 | server 82 | Redis client instance. 83 | 84 | """ 85 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 86 | url = kwargs.pop('url', None) 87 | if url: 88 | return redis_cls.from_url(url, **kwargs) 89 | else: 90 | return redis_cls(**kwargs) 91 | -------------------------------------------------------------------------------- /scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | 4 | # For standalone use. 5 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 6 | 7 | PIPELINE_KEY = '%(spider)s:items' 8 | 9 | REDIS_CLS = redis.StrictRedis 10 | REDIS_ENCODING = 'utf-8' 11 | # Sane connection defaults. 
12 | REDIS_PARAMS = { 13 | 'socket_timeout': 30, 14 | 'socket_connect_timeout': 30, 15 | 'retry_on_timeout': True, 16 | 'encoding': REDIS_ENCODING, 17 | } 18 | 19 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 20 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 21 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 22 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 23 | 24 | START_URLS_KEY = '%(name)s:start_urls' 25 | START_URLS_AS_SET = False 26 | -------------------------------------------------------------------------------- /scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from . import defaults 8 | from .connection import get_redis_from_settings 9 | from libs.bloomfilter import BloomFilter, conn 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # TODO: Rename class to RedisDupeFilter. 16 | class RFPDupeFilter(BaseDupeFilter): 17 | """Redis-based request duplicates filter. 18 | 19 | This class can also be used with default Scrapy's scheduler. 20 | 21 | """ 22 | 23 | logger = logger 24 | 25 | def __init__(self, server, key, debug=False): 26 | """Initialize the duplicates filter. 27 | 28 | Parameters 29 | ---------- 30 | server : redis.StrictRedis 31 | The redis server instance. 32 | key : str 33 | Redis key Where to store fingerprints. 34 | debug : bool, optional 35 | Whether to log filtered requests. 36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.debug = debug 41 | self.logdupes = True 42 | self.bf = BloomFilter(conn=conn, key=key) 43 | 44 | @classmethod 45 | def from_settings(cls, settings): 46 | """Returns an instance from given settings. 47 | 48 | This uses by default the key ``dupefilter:``. When using the 49 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 50 | it needs to pass the spider name in the key. 51 | 52 | Parameters 53 | ---------- 54 | settings : scrapy.settings.Settings 55 | 56 | Returns 57 | ------- 58 | RFPDupeFilter 59 | A RFPDupeFilter instance. 60 | 61 | 62 | """ 63 | server = get_redis_from_settings(settings) 64 | # XXX: This creates one-time key. needed to support to use this 65 | # class as standalone dupefilter with scrapy's default scheduler 66 | # if scrapy passes spider on open() method this wouldn't be needed 67 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 68 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 69 | debug = settings.getbool('DUPEFILTER_DEBUG') 70 | return cls(server, key=key, debug=debug) 71 | 72 | @classmethod 73 | def from_crawler(cls, crawler): 74 | """Returns instance from crawler. 75 | 76 | Parameters 77 | ---------- 78 | crawler : scrapy.crawler.Crawler 79 | 80 | Returns 81 | ------- 82 | RFPDupeFilter 83 | Instance of RFPDupeFilter. 84 | 85 | """ 86 | return cls.from_settings(crawler.settings) 87 | 88 | def request_seen(self, request): 89 | """Returns True if request was already seen. 90 | 91 | Parameters 92 | ---------- 93 | request : scrapy.http.Request 94 | 95 | Returns 96 | ------- 97 | bool 98 | 99 | """ 100 | fp = self.request_fingerprint(request) 101 | if self.bf.is_exist(fp): 102 | return True 103 | else: 104 | self.bf.add(fp) 105 | return False 106 | # # This returns the number of values added, zero if already exists. 
107 | # added = self.server.sadd(self.key, fp) 108 | # return added == 0 109 | 110 | def request_fingerprint(self, request): 111 | """Returns a fingerprint for a given request. 112 | 113 | Parameters 114 | ---------- 115 | request : scrapy.http.Request 116 | 117 | Returns 118 | ------- 119 | str 120 | 121 | """ 122 | return request_fingerprint(request) 123 | 124 | @classmethod 125 | def from_spider(cls, spider): 126 | settings = spider.settings 127 | server = get_redis_from_settings(settings) 128 | dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY) 129 | key = dupefilter_key % {'spider': spider.name} 130 | debug = settings.getbool('DUPEFILTER_DEBUG') 131 | return cls(server, key=key, debug=debug) 132 | 133 | def close(self, reason=''): 134 | """Delete data on close. Called by Scrapy's scheduler. 135 | 136 | Parameters 137 | ---------- 138 | reason : str, optional 139 | 140 | """ 141 | self.clear() 142 | 143 | def clear(self): 144 | """Clears fingerprints data.""" 145 | self.server.delete(self.key) 146 | 147 | def log(self, request, spider): 148 | """Logs given request. 149 | 150 | Parameters 151 | ---------- 152 | request : scrapy.http.Request 153 | spider : scrapy.spiders.Spider 154 | 155 | """ 156 | if self.debug: 157 | msg = "Filtered duplicate request: %(request)s" 158 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 159 | elif self.logdupes: 160 | msg = ("Filtered duplicate request %(request)s" 161 | " - no more duplicates will be shown" 162 | " (see DUPEFILTER_DEBUG to show all duplicates)") 163 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 164 | self.logdupes = False 165 | -------------------------------------------------------------------------------- /scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | 11 | class RedisPipeline(object): 12 | """Pushes serialized item into a redis list/queue 13 | 14 | Settings 15 | -------- 16 | REDIS_ITEMS_KEY : str 17 | Redis key where to store items. 18 | REDIS_ITEMS_SERIALIZER : str 19 | Object path to serializer function. 20 | 21 | """ 22 | 23 | def __init__(self, server, 24 | key=defaults.PIPELINE_KEY, 25 | serialize_func=default_serialize): 26 | """Initialize pipeline. 27 | 28 | Parameters 29 | ---------- 30 | server : StrictRedis 31 | Redis client instance. 32 | key : str 33 | Redis key where to store items. 34 | serialize_func : callable 35 | Items serializer function. 
36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.serialize = serialize_func 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | params = { 45 | 'server': connection.from_settings(settings), 46 | } 47 | if settings.get('REDIS_ITEMS_KEY'): 48 | params['key'] = settings['REDIS_ITEMS_KEY'] 49 | if settings.get('REDIS_ITEMS_SERIALIZER'): 50 | params['serialize_func'] = load_object( 51 | settings['REDIS_ITEMS_SERIALIZER'] 52 | ) 53 | 54 | return cls(**params) 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | return cls.from_settings(crawler.settings) 59 | 60 | def process_item(self, item, spider): 61 | return deferToThread(self._process_item, item, spider) 62 | 63 | def _process_item(self, item, spider): 64 | key = self.item_key(item, spider) 65 | data = self.serialize(item) 66 | self.server.rpush(key, data) 67 | return item 68 | 69 | def item_key(self, item, spider): 70 | """Returns redis key based on given spider. 71 | 72 | Override this function to use a different key depending on the item 73 | and/or spider. 74 | 75 | """ 76 | return self.key % {'spider': spider.name} 77 | -------------------------------------------------------------------------------- /scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 
27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | 144 | # TODO: Deprecate the use of these names. 
145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 
98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | self.df = load_object(self.dupefilter_cls).from_spider(spider) 138 | 139 | if self.flush_on_start: 140 | self.flush() 141 | # notice if there are requests already in the queue to resume the crawl 142 | if len(self.queue): 143 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 144 | 145 | def close(self, reason): 146 | if not self.persist: 147 | self.flush() 148 | 149 | def flush(self): 150 | self.df.clear() 151 | self.queue.clear() 152 | 153 | def enqueue_request(self, request): 154 | if not request.dont_filter and self.df.request_seen(request): 155 | self.df.log(request, self.spider) 156 | return False 157 | if self.stats: 158 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 159 | self.queue.push(request) 160 | return True 161 | 162 | def next_request(self): 163 | block_pop_timeout = self.idle_before_close 164 | request = self.queue.pop(block_pop_timeout) 165 | if request and self.stats: 166 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 167 | return request 168 | 169 | def has_pending_requests(self): 170 | return len(self) > 0 171 | -------------------------------------------------------------------------------- /scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, CrawlSpider 4 | 5 | from . import connection, defaults 6 | from .utils import bytes_to_str 7 | 8 | 9 | class RedisMixin(object): 10 | """Mixin class to implement reading urls from a redis queue.""" 11 | redis_key = None 12 | redis_batch_size = None 13 | redis_encoding = None 14 | 15 | # Redis client placeholder. 16 | server = None 17 | 18 | def start_requests(self): 19 | """Returns a batch of start requests from redis.""" 20 | return self.next_requests() 21 | 22 | def setup_redis(self, crawler=None): 23 | """Setup redis connection and idle signal. 24 | 25 | This should be called after the spider has set its crawler object. 26 | """ 27 | if self.server is not None: 28 | return 29 | 30 | if crawler is None: 31 | # We allow optional crawler argument to keep backwards 32 | # compatibility. 33 | # XXX: Raise a deprecation warning. 
34 | crawler = getattr(self, 'crawler', None) 35 | 36 | if crawler is None: 37 | raise ValueError("crawler is required") 38 | 39 | settings = crawler.settings 40 | 41 | if self.redis_key is None: 42 | self.redis_key = settings.get( 43 | 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY, 44 | ) 45 | 46 | self.redis_key = self.redis_key % {'name': self.name} 47 | 48 | if not self.redis_key.strip(): 49 | raise ValueError("redis_key must not be empty") 50 | 51 | if self.redis_batch_size is None: 52 | # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE). 53 | self.redis_batch_size = settings.getint( 54 | 'REDIS_START_URLS_BATCH_SIZE', 55 | settings.getint('CONCURRENT_REQUESTS'), 56 | ) 57 | 58 | try: 59 | self.redis_batch_size = int(self.redis_batch_size) 60 | except (TypeError, ValueError): 61 | raise ValueError("redis_batch_size must be an integer") 62 | 63 | if self.redis_encoding is None: 64 | self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING) 65 | 66 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' " 67 | "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s", 68 | self.__dict__) 69 | 70 | self.server = connection.from_settings(crawler.settings) 71 | # The idle signal is called when the spider has no requests left, 72 | # that's when we will schedule new requests from redis queue 73 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 74 | 75 | def next_requests(self): 76 | """Returns a request to be scheduled or none.""" 77 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) 78 | fetch_one = self.server.spop if use_set else self.server.lpop 79 | # XXX: Do we need to use a timeout here? 80 | found = 0 81 | # TODO: Use redis pipeline execution. 82 | while found < self.redis_batch_size: 83 | data = fetch_one(self.redis_key) 84 | if not data: 85 | # Queue empty. 86 | break 87 | req = self.make_request_from_data(data) 88 | if req: 89 | yield req 90 | found += 1 91 | else: 92 | self.logger.debug("Request not made from data: %r", data) 93 | 94 | if found: 95 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key) 96 | 97 | def make_request_from_data(self, data): 98 | """Returns a Request instance from data coming from Redis. 99 | 100 | By default, ``data`` is an encoded URL. You can override this method to 101 | provide your own message decoding. 102 | 103 | Parameters 104 | ---------- 105 | data : bytes 106 | Message from redis. 107 | 108 | """ 109 | url = bytes_to_str(data, self.redis_encoding) 110 | return self.make_requests_from_url(url) 111 | 112 | def schedule_next_requests(self): 113 | """Schedules a request if available""" 114 | # TODO: While there is capacity, schedule a batch of redis requests. 115 | for req in self.next_requests(): 116 | self.crawler.engine.crawl(req, spider=self) 117 | 118 | def spider_idle(self): 119 | """Schedules a request if available, otherwise waits.""" 120 | # XXX: Handle a sentinel to close the spider. 121 | self.schedule_next_requests() 122 | raise DontCloseSpider 123 | 124 | 125 | class RedisSpider(RedisMixin, Spider): 126 | """Spider that reads urls from redis queue when idle. 127 | 128 | Attributes 129 | ---------- 130 | redis_key : str (default: REDIS_START_URLS_KEY) 131 | Redis key where to fetch start URLs from.. 132 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 133 | Number of messages to fetch from redis on each attempt. 
134 | redis_encoding : str (default: REDIS_ENCODING) 135 | Encoding to use when decoding messages from redis queue. 136 | 137 | Settings 138 | -------- 139 | REDIS_START_URLS_KEY : str (default: ":start_urls") 140 | Default Redis key where to fetch start URLs from.. 141 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 142 | Default number of messages to fetch from redis on each attempt. 143 | REDIS_START_URLS_AS_SET : bool (default: False) 144 | Use SET operations to retrieve messages from the redis queue. If False, 145 | the messages are retrieve using the LPOP command. 146 | REDIS_ENCODING : str (default: "utf-8") 147 | Default encoding to use when decoding messages from redis queue. 148 | 149 | """ 150 | 151 | @classmethod 152 | def from_crawler(self, crawler, *args, **kwargs): 153 | obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs) 154 | obj.setup_redis(crawler) 155 | return obj 156 | 157 | 158 | class RedisCrawlSpider(RedisMixin, CrawlSpider): 159 | """Spider that reads urls from redis queue when idle. 160 | 161 | Attributes 162 | ---------- 163 | redis_key : str (default: REDIS_START_URLS_KEY) 164 | Redis key where to fetch start URLs from.. 165 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 166 | Number of messages to fetch from redis on each attempt. 167 | redis_encoding : str (default: REDIS_ENCODING) 168 | Encoding to use when decoding messages from redis queue. 169 | 170 | Settings 171 | -------- 172 | REDIS_START_URLS_KEY : str (default: ":start_urls") 173 | Default Redis key where to fetch start URLs from.. 174 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 175 | Default number of messages to fetch from redis on each attempt. 176 | REDIS_START_URLS_AS_SET : bool (default: True) 177 | Use SET operations to retrieve messages from the redis queue. 178 | REDIS_ENCODING : str (default: "utf-8") 179 | Default encoding to use when decoding messages from redis queue. 180 | 181 | """ 182 | 183 | @classmethod 184 | def from_crawler(self, crawler, *args, **kwargs): 185 | obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs) 186 | obj.setup_redis(crawler) 187 | return obj 188 | -------------------------------------------------------------------------------- /scrapy_redis/utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def bytes_to_str(s, encoding='utf-8'): 5 | """Returns a str if a bytes object is given.""" 6 | if six.PY3 and isinstance(s, bytes): 7 | return s.decode(encoding) 8 | return s 9 | --------------------------------------------------------------------------------
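The tail of `Zhihu/zheye/__init__.py` shown earlier is the core of the inverted-character detector: the pixel coordinates collected into `Y` are clustered with a 7-component `GaussianMixture`, a 3×3 grid of 40×40 crops around each cluster centre is scored by the bundled Keras CNN, and a centre is only reported as an upside-down character when more than 4 of its 9 crops are classified as inverted; 20 is then subtracted from each coordinate, matching the `radius=20` border that `CenterExtend` adds. A minimal usage sketch follows. The class and method names (`zheye`, `Recognize`) follow the upstream zheye project, and the import path and the half-size scaling of the click offsets are assumptions; the spider's actual call site is not reproduced in this dump.

```
# Usage sketch for the recognizer whose internals appear above.  The names
# zheye / Recognize follow the upstream zheye project; the import path and the
# half-size scaling are assumptions, not a copy of the spider's code.
from zheye import zheye

z = zheye()                                               # loads one of the bundled zheyeV*.keras models
points = z.Recognize('Zhihu/captcha/chinese_captcha.jpeg')

# Recognize() returns (row, col) coordinates with the 20 px CenterExtend border
# already subtracted.  If the page renders the captcha at half size, the click
# offsets are simply the halved coordinates, x before y for mouse-offset calls.
click_offsets = [(col / 2, row / 2) for row, col in points]
print(click_offsets)
```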
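`RFPDupeFilter.request_seen()` in the bundled `scrapy_redis/dupefilter.py` replaces the stock `SADD`-based fingerprint set (left commented out in the source) with the project's `libs.bloomfilter.BloomFilter`, trading a small false-positive rate for a fixed memory footprint as the number of crawled URLs grows. `libs/bloomfilter.py` itself is not reproduced in this dump, so the sketch below is only an illustration of a Redis-backed filter that exposes the same `is_exist()` / `add()` interface; the bit-map size, seeds and connection parameters are placeholders, and `mmh3` / `redis` are the packages pinned in requirements.txt.

```
# Illustration only: a Redis-backed Bloom filter with the is_exist()/add()
# interface that RFPDupeFilter.request_seen() calls.  The project's real
# implementation lives in libs/bloomfilter.py (not shown); bit-map size,
# seeds and connection parameters here are placeholder values.
import mmh3
import redis

conn = redis.StrictRedis(host='localhost', port=6379)


class BloomFilter(object):
    def __init__(self, conn, key, bit_size=1 << 30, seeds=(5, 7, 11, 13, 31, 37, 61)):
        self.conn = conn          # shared Redis connection
        self.key = key            # Redis string used as the bit map
        self.bit_size = bit_size  # number of addressable bits
        self.seeds = seeds        # one MurmurHash3 position per seed

    def _offsets(self, value):
        return [mmh3.hash(value, seed) % self.bit_size for seed in self.seeds]

    def is_exist(self, value):
        # "Seen" only if every hashed bit is already set; this can yield a
        # false positive but never a false negative.
        return all(self.conn.getbit(self.key, off) for off in self._offsets(value))

    def add(self, value):
        for off in self._offsets(value):
            self.conn.setbit(self.key, off, 1)
```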
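`PriorityQueue` in `scrapy_redis/queue.py` stores each serialized request in a Redis sorted set with `score = -request.priority` and pops with an atomic `ZRANGE` + `ZREMRANGEBYRANK` pipeline. Because sorted sets are returned in ascending score order, negating the priority makes the highest-priority request come out first. A standalone illustration against a local Redis instance (the key name and payloads are demo-only):

```
# Demonstrates why PriorityQueue negates request.priority: ZRANGE walks the
# sorted set in ascending score order, so score = -priority puts the highest
# priority at rank 0.  Requires a local Redis; key and payloads are demo-only.
import redis

r = redis.StrictRedis(host='localhost', port=6379)
key = 'demo:requests'
r.delete(key)

for payload, priority in [(b'low', 0), (b'high', 10), (b'medium', 5)]:
    r.execute_command('ZADD', key, -priority, payload)

# Atomic "peek then remove", mirroring PriorityQueue.pop():
pipe = r.pipeline()
pipe.multi()
pipe.zrange(key, 0, 0).zremrangebyrank(key, 0, 0)
results, count = pipe.execute()
print(results)  # [b'high'] -- the request with the largest priority wins
```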
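For Scrapy to actually use the bundled `scrapy_redis` components, the project's `Zhihu/settings.py` has to point the scheduler, the dupefilter and (optionally) an item pipeline at them, and tell `connection.get_redis_from_settings()` where Redis lives. Since `settings.py` is not reproduced in this dump, the snippet below shows the standard scrapy-redis wiring rather than a copy of the project's file; the host and port are the usual defaults for a local Redis.

```
# Standard scrapy-redis wiring (illustrative -- the project's real settings.py
# is not shown in this dump).  The names match the classes and settings read by
# the bundled scheduler.py, dupefilter.py, connection.py and pipelines.py.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
SCHEDULER_PERSIST = True          # keep the request queue and fingerprints between runs

REDIS_HOST = 'localhost'          # read via connection.SETTINGS_PARAMS_MAP
REDIS_PORT = 6379

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,   # optional: also push items into Redis
}
```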