├── .gitignore ├── README.md ├── pyloom ├── __init__.py ├── __main__.py ├── buckets.py ├── drivers.py ├── entry.py ├── errors.py ├── lua │ ├── bloom_cas.lua │ ├── bloom_check.lua │ ├── url_add.lua │ └── url_pop.lua ├── proxy.py ├── scheduler.py ├── tasks.py ├── user-agent.json ├── utils.py └── worker.py ├── setup.py └── spiders ├── DouBan250 ├── README.md ├── __init__.py └── configs.py ├── DouBanBooks ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── LaGou ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── PinDuoDuo ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── PinDuoDuoWEB ├── README.md ├── __init__.py ├── configs.py ├── get_anticontent.js └── tasks.py ├── WeiBo ├── README.md ├── __init__.py ├── configs.py └── tasks.py └── Ziroom ├── README.md ├── __init__.py ├── configs.py └── tasks.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | ### JetBrains template 93 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 94 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 95 | 96 | # User-specific stuff: 97 | .idea/ 98 | .DS_Store 99 | 100 | # Sensitive or high-churn files: 101 | .idea/dataSources.ids 102 | .idea/dataSources.xml 103 | .idea/dataSources.local.xml 104 | .idea/sqlDataSources.xml 105 | .idea/dynamic.xml 106 | .idea/uiDesigner.xml 107 | 108 | # Gradle: 109 | .idea/gradle.xml 110 | .idea/libraries 111 | 112 | # Mongo Explorer plugin: 113 | .idea/mongoSettings.xml 114 | 115 | ## File-based project format: 116 | *.iws 117 | 118 | ## Plugin-specific files: 119 | 120 | # IntelliJ 121 | /out/ 122 | 123 | # mpeltonen/sbt-idea plugin 124 | .idea_modules/ 125 | 126 | # JIRA plugin 127 | atlassian-ide-plugin.xml 128 | 129 | # Crashlytics plugin (for Android Studio and IntelliJ) 130 | com_crashlytics_export_strings.xml 131 | crashlytics.properties 132 | crashlytics-build.properties 133 | fabric.properties 134 | 135 | # pytest 136 | .pytest_cache/ 137 | 138 | 
# pyloom 139 | __dev_*.py 140 | __debugger__/ 141 | logs/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyLoom,爬龙! 2 | 3 | PyLoom想为有价值的网站编写爬虫,让开发者便捷地获取结构化的数据。 4 | 5 | PyLoom由三个部分组成, 6 | 7 | 1. 框架,减少编写、运行、维护爬虫的工作量。 8 | 9 | 2. 爬虫,寻找有价值的目标为其开发爬虫,并维护既有爬虫的可用性。 10 | 11 | 预期19年底,PyLoom将拥有围绕电子商务、房屋租售、社交网络、新闻媒体的数十个爬虫。 12 | 13 | 3. 升级爬虫,对于频繁使用的爬虫,增强其能力 14 | + 增强定制能力,例如支持限定地区、类别、关键字抓取; 15 | + 增强抓取策略,减少对代理、打码接口的使用; 16 | + 增强更新策略,更细粒度地计算重复抓取的时间。 17 | 18 | 目前进度, 19 | 20 | ①部分完成,开发常见爬虫够用了,随爬虫的开发迭代出更多功能; 21 | 22 | ②已有几款爬虫,放置于`spiders`目录。 23 | 24 | 25 | 26 | ## 安装 27 | 28 | 1. **环境要求** 29 | 30 | + python 3.6.0+ 31 | + redis 2.6+ 32 | + 类unix系统 33 | 34 | 2. **安装PyLoom** 35 | 36 | ```bash 37 | git clone https://github.com/spencer404/PyLoom.git 38 | python3.6 -m pip install -e ./PyLoom 39 | ``` 40 | 41 | > 添加 `-i https://pypi.douban.com/simple` 参数,利用豆瓣镜像提速。 42 | 43 | >出现错误`fatal error: Python.h: No such file or directory`时, 44 | > 45 | >需安装对应平台的python3.x-devel包 46 | > 47 | 48 | 49 | 50 | ## 运行 51 | 52 | 以运行`spiders/WeiBo`为例, 53 | 54 | 1. **最简参数启动爬虫** 55 | 56 | ```bash 57 | pyloom run -s PyLoom/spiders/WeiBo 58 | ``` 59 | 60 | >在爬虫目录中执行`run`时,可省略`-s`参数。 61 | 62 | 2. **启动代理池** 63 | 64 | ```bash 65 | pyloom proxy run 66 | ``` 67 | 68 | 3. **添加代理** 69 | 70 | 根据命令提示,添加名为"xxx"的代理 71 | 72 | ```bash 73 | pyloom proxy add 74 | ``` 75 | 76 | 4. **使用代理启动爬虫** 77 | 78 | ```bash 79 | pyloom run --proxy xxx 80 | ``` 81 | 82 | 命令`run`的部分常用参数: 83 | 84 | ```bash 85 | -l, --level 日志级别 86 | -s, --spider 指定爬虫目录 87 | -r, --redis 指定redis地址(URL形式) 88 | -C, --clear 清空队列、代理数据后运行 89 | --proxy 使用指定代理运行,逗号分隔多个代理 90 | --damon 作为守护进程运行 91 | -p 子进程数量 92 | -t 每个子进程的线程数量 93 | ``` 94 | 95 | 在多台服务器上运行时,若参数`-s、-r`所指向的目标相同,即可横向扩容性能。 96 | 97 | 默认地,PyLoom将抓到数据打印在日志中,你可以修改`on_save`函数自定义如何保存。 -------------------------------------------------------------------------------- /pyloom/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import Task 2 | -------------------------------------------------------------------------------- /pyloom/__main__.py: -------------------------------------------------------------------------------- 1 | """作为模块启动""" 2 | if __name__ == '__main__': 3 | from .entry import main 4 | 5 | main() 6 | -------------------------------------------------------------------------------- /pyloom/buckets.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import fnmatch 4 | import threading 5 | from .errors import BucketError 6 | from redis import StrictRedis, exceptions 7 | 8 | 9 | class LocalBucket(object): 10 | """进程内存储,重启后数据丢失""" 11 | _lock = None 12 | _instances = {} 13 | 14 | def __init__(self): 15 | self._db = {} 16 | if LocalBucket._lock is None: 17 | LocalBucket._lock = threading.Lock() 18 | 19 | @classmethod 20 | def instance(cls, name): 21 | """获取单例""" 22 | var = LocalBucket._instances.get(name, None) 23 | if var: 24 | return var 25 | var = LocalBucket() 26 | LocalBucket._instances[name] = var 27 | return var 28 | 29 | @classmethod 30 | def purge(cls): 31 | """清理由instance创建的所有实例的过期key,返回被清理的数量""" 32 | count = 0 33 | for instance in cls._instances.values(): 34 | count += instance._purge() 35 | return count 36 | 37 | def _purge(self): 38 | """清理实例中过期的key,返回被清理的数量""" 39 | keys = [] 40 | for key, (_, expire_at) in self._db.items(): 41 | if expire_at is not None and expire_at <= 
time.time(): 42 | keys.append(key) 43 | for key in keys: 44 | del self._db[key] 45 | return len(keys) 46 | 47 | def set(self, key, value, ttl=None): 48 | """为key设置value,ttl秒后失效""" 49 | item = self._db.get(key, None) 50 | if item is None or ttl is not None: 51 | # 更改value和ttl 52 | if ttl is None: 53 | expire_at = None 54 | else: 55 | expire_at = time.time() + ttl 56 | self._db[key] = [value, expire_at] 57 | else: 58 | # 只更改ttl 59 | self._db[key][0] = value 60 | 61 | def delete(self, *keys) -> int: 62 | """删除一个或多个key,返回被删除的数量""" 63 | count = 0 64 | for key in keys: 65 | item = self._db.get(key, None) 66 | # 忽略不存在的key 67 | if item is None: 68 | continue 69 | expire_at = item[1] 70 | if expire_at is None or expire_at > time.time(): 71 | del self._db[key] 72 | count += 1 73 | else: # 键已过期,不累加计数器 74 | del self._db[key] 75 | return count 76 | 77 | def get(self, key) -> object: 78 | """返回key的value,当key不存在时返回None""" 79 | item = self._db.get(key, None) 80 | if item is None: 81 | return None 82 | value, expire_at = item 83 | if expire_at is None: 84 | return value 85 | elif expire_at > time.time(): 86 | return value 87 | else: # 键已过期 88 | del self._db[key] 89 | return None 90 | 91 | def getset(self, key, value) -> object: 92 | """为给定key设置新value,返回旧value""" 93 | old_value = self.get(key) 94 | self.set(key, value) 95 | return old_value 96 | 97 | def keys(self, pattern='*') -> list: 98 | """ 99 | 返回满足pattern的所有键 100 | pattern支持通配符:?、*、[] 101 | """ 102 | expired_keys = [] 103 | valid_keys = [] 104 | n = time.time() 105 | for key, (_, expire_at) in self._db.items(): 106 | if expire_at is not None and expire_at <= n: 107 | expired_keys.append(key) 108 | else: 109 | if fnmatch.fnmatch(key, pattern): 110 | valid_keys.append(key) 111 | for key in expired_keys: 112 | del self._db[key] 113 | return valid_keys 114 | 115 | def expire(self, key, ttl) -> bool: 116 | """为给定key设置生存时间,ttl秒后被自动删除""" 117 | item = self._db.get(key, None) 118 | if item is None: 119 | return False 120 | _, expire_at = item 121 | if expire_at is None or expire_at >= time.time(): 122 | self._db[key][1] = ttl + time.time() 123 | return True 124 | else: # 键已过期 125 | del self._db[key] 126 | return False 127 | 128 | def ttl(self, key) -> int: 129 | """ 130 | 返回给定key的剩余生存时间 131 | Returns: 132 | 当key不存在时,返回-2; 133 | 当key存在但没有设置剩余生存时间时,返回-1; 134 | 否则,返回key的剩余生存时间 135 | """ 136 | item = self._db.get(key, None) 137 | if item is None: 138 | return -2 139 | value, expire_at = item 140 | if expire_at is None: 141 | return -1 142 | elif expire_at > time.time(): 143 | return expire_at - time.time() 144 | else: # 键已过期 145 | del self._db[key] 146 | return -2 147 | 148 | def incr(self, key, amount=1) -> int: 149 | """ 150 | 将给定key的值加上amount,返回incr后的值 151 | 若key不存在,key被先初始化为0,再incr 152 | 若value非int型,抛出异常 153 | """ 154 | with LocalBucket._lock: 155 | old_value = self.get(key) 156 | if old_value is None: 157 | self.set(key, 0, None) 158 | old_value = 0 159 | elif not isinstance(old_value, int): 160 | raise BucketError("incr应作用于int型的值") 161 | new_value = old_value + amount 162 | self.set(key, new_value) 163 | return new_value 164 | 165 | 166 | class ShareBucket(object): 167 | """共享存储,利用redis存储,不易失""" 168 | prefix = "bucket" 169 | 170 | def __init__(self, db: StrictRedis, name): 171 | self._db = db 172 | self.name = name 173 | self.key_prefix = f"{self.prefix}:{name}" 174 | 175 | def set(self, key, value, ttl=None): 176 | """为key设置value,ttl秒后失效""" 177 | self._db.set(f"{self.key_prefix}:{key}", json.dumps(value), ex=ttl) 178 | 179 | def delete(self, *keys) -> int: 
180 | """删除一个或多个key,返回被删除的数量""" 181 | return self._db.delete(*[f"{self.key_prefix}:{k}" for k in keys]) 182 | 183 | def get(self, key) -> object: 184 | """返回key的value,当key不存在时返回None""" 185 | res = self._db.get(f"{self.key_prefix}:{key}") 186 | if res: 187 | return json.loads(res) 188 | else: 189 | return res 190 | 191 | def getset(self, key, value) -> object: 192 | """为给定key设置新value,返回旧value""" 193 | res = self._db.getset(f"{self.key_prefix}:{key}", value) 194 | if res: 195 | return json.loads(res) 196 | else: 197 | return res 198 | 199 | def keys(self, pattern='*') -> list: 200 | """ 201 | 返回满足pattern的所有键 202 | pattern支持通配符:?、*、[] 203 | """ 204 | p = len(f"{self.key_prefix}:") 205 | res = self._db.keys(f"{self.key_prefix}:{pattern}") 206 | return [r.decode()[p:] for r in res] 207 | 208 | def expire(self, key, ttl) -> bool: 209 | """为给定key设置生存时间,ttl秒后被自动删除""" 210 | return self._db.expire(f"{self.key_prefix}:{key}", ttl) 211 | 212 | def ttl(self, key) -> int: 213 | """ 214 | 返回给定key的剩余生存时间 215 | Returns: 216 | 当key不存在时,返回-2; 217 | 当key存在但没有设置剩余生存时间时,返回-1; 218 | 否则,返回key的剩余生存时间 219 | """ 220 | return self._db.ttl(f"{self.key_prefix}:{key}") 221 | 222 | def incr(self, key, amount=1) -> int: 223 | """ 224 | 将给定key的值加上amount,返回incr后的值 225 | 若key不存在,key被先初始化为0,再incr 226 | 若value非int型,抛出异常 227 | """ 228 | try: 229 | return self._db.incr(f"{self.key_prefix}:{key}", amount) 230 | except exceptions.ResponseError as e: 231 | if e.args[0] == 'value is not an integer or out of range': 232 | raise BucketError("incr应作用于int型的值") 233 | 234 | def lpush(self, key, *values) -> int: 235 | """ 236 | 将一个或多个值value插入到列表key的表头 237 | 返回执行LPUSH命令后,列表的长度。 238 | """ 239 | return self._db.lpush(f"{self.key_prefix}:{key}", *values) 240 | 241 | def lrange(self, key, start, end) -> list: 242 | """ 243 | 返回列表 key 中指定区间内的元素,区间以偏移量 start 和 stop 指定。 244 | 包含指定区间内的元素的list 245 | """ 246 | return self._db.lrange(f"{self.key_prefix}:{key}", start, end) 247 | 248 | def lock(self, key, timeout, **kwargs): 249 | """分布式锁""" 250 | return self._db.lock(f"{self.key_prefix}:{key}", timeout, **kwargs) 251 | -------------------------------------------------------------------------------- /pyloom/drivers.py: -------------------------------------------------------------------------------- 1 | import furl 2 | import time 3 | import logging 4 | import requests 5 | import traceback 6 | from . 
import utils 7 | 8 | logger = logging.getLogger("drivers") 9 | 10 | 11 | class ProxyDriver(object): 12 | """代理驱动的基类,必须继承此类,否则驱动不能被识别""" 13 | 14 | def __init__(self, **kwargs): 15 | """在代理启动时传入自定义参数""" 16 | self.url = kwargs['url'] 17 | self.interval = kwargs['interval'] 18 | self.parallel = kwargs['parallel'] 19 | 20 | @classmethod 21 | def get_params(cls): 22 | """获取自定义参数""" 23 | template = [ 24 | { 25 | 'name': 'url', 26 | 'title': '代理提取接口?', 27 | 'example': 'http://api.example.com', 28 | 'regex': 'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 29 | }, 30 | { 31 | 'name': 'interval', 32 | 'title': '每隔多少秒调用一次接口?', 33 | 'type': int, 34 | 'note': "0-无间隔" 35 | }, 36 | { 37 | 'name': 'parallel', 38 | 'title': '每个代理能被多少线程并发使用?', 39 | 'type': int 40 | } 41 | ] 42 | return utils.template_input(template) 43 | 44 | def gen_addresses(self): 45 | """ 46 | 返回一个生成器,每次迭代时返回一个代理,其格式为: 47 | valid_at:expire_at:address 48 | valid_at : 当前时间大于valid_at时代理可用 49 | expire_at: 当前时间小于expire_at时代理可用,大于expire_at时被删除 50 | address : 代理地址,支持http、https、socks5协议 51 | """ 52 | raise NotImplementedError 53 | 54 | 55 | class MoGuProxy(ProxyDriver): 56 | title = '蘑菇API代理' 57 | 58 | def gen_addresses(self): 59 | logger.info("代理已启动", self.title, self.url) 60 | while True: 61 | try: 62 | time.sleep(self.interval / 2) # 接口故障时睡眠一半时间 63 | try: 64 | resp = requests.get(self.url, timeout=1) 65 | except Exception as e: 66 | yield False, f"接口请求异常:{e}" 67 | continue 68 | 69 | if resp.status_code != 200: 70 | yield False, f"接口状态码异常:{resp.status_code}" 71 | continue 72 | 73 | try: 74 | data = resp.json() 75 | except Exception: 76 | yield False, f"接口返回值非JSON格式" 77 | continue 78 | 79 | if int(data.get('code', -1)) != 0: 80 | yield False, f'接口返回异常:{data.get("msg", "unknown")}' 81 | continue 82 | 83 | expire_at = time.time() + 600 84 | addresses = [f"0:{expire_at}:http://{i['ip']}:{i['port']}" for i in data.get('msg', [])] 85 | yield True, addresses * self.parallel 86 | time.sleep(self.interval / 2) 87 | except Exception as e: 88 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc()) 89 | 90 | 91 | class MiPuProxy(ProxyDriver): 92 | title = "米扑开放代理" 93 | 94 | def gen_addresses(self): 95 | logger.info("代理已启动", self.title, self.url) 96 | while True: 97 | try: 98 | url = furl.furl(self.url) 99 | url.query.params.set('result_format', 'json') 100 | time.sleep(self.interval / 2) # 接口故障时睡眠一半时间 101 | try: 102 | resp = requests.get(url, timeout=1) 103 | except Exception as e: 104 | yield False, f"接口请求异常:{e}" 105 | continue 106 | 107 | if resp.status_code != 200: 108 | yield False, f"接口状态码异常:{resp.status_code}" 109 | continue 110 | 111 | try: 112 | data = resp.json() 113 | except Exception: 114 | yield False, f"接口返回值非JSON格式" 115 | continue 116 | 117 | if int(data.get('code', -1)) != 0: 118 | yield False, f'接口返回异常:{data.get("msg", "unknown")}' 119 | continue 120 | 121 | expire_at = time.time() + 60 * 60 * 24 * 30 * 12 # 有效期一年 122 | addresses = [] 123 | for item in data.get('result', []): 124 | scheme = item['http_type'].lower() 125 | server = item['ip:port'] 126 | addresses.append(f"0:{expire_at}:{scheme}://{server}") 127 | 128 | yield True, addresses * self.parallel 129 | time.sleep(self.interval / 2) 130 | except Exception as e: 131 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc()) 132 | -------------------------------------------------------------------------------- /pyloom/entry.py: -------------------------------------------------------------------------------- 1 | """ 2 | 程序入口 3 | 解析命令行参数、配置文件参数,启动对应模块 
4 | 所有有关参数解析的操作应当在这里完成 5 | """ 6 | import json 7 | import redis 8 | import daemon 9 | import signal 10 | import psutil 11 | import datetime 12 | import argparse 13 | import daemon.pidfile 14 | from .utils import * 15 | from .errors import * 16 | from tabulate import tabulate 17 | from .scheduler import Spider, Queue 18 | from . import drivers, worker, proxy, tasks 19 | 20 | logger = logging.getLogger("entry") 21 | 22 | 23 | def set_defaults(options): 24 | """设置默认值""" 25 | # 设置日志 26 | if hasattr(options, 'log'): 27 | if options.log: 28 | options.log = os.path.abspath(os.path.expanduser(options.log)) 29 | else: 30 | root_path = os.path.dirname(os.path.dirname(__file__)) 31 | options.log = os.path.join(root_path, 'logs') 32 | os.makedirs(options.log, exist_ok=True) 33 | logging.getLogger("requests").setLevel(logging.WARNING) 34 | patch_logger_format() 35 | if hasattr(options, 'level'): 36 | logging.basicConfig(level=options.level.upper()) 37 | # 设置爬虫目录 38 | if hasattr(options, 'spider'): 39 | options.spider = os.path.abspath(os.path.expanduser(options.spider)) 40 | setattr(options, 'name', os.path.basename(options.spider)) 41 | 42 | 43 | def set_console_logger(): 44 | """设置在控制台中输出日志""" 45 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s' 46 | date_fmt = '%y%m%d %H:%M:%S' 47 | formatter = logging.Formatter(fmt, date_fmt) 48 | handler = logging.StreamHandler() 49 | handler.setFormatter(formatter) 50 | patch_handler_color(handler) 51 | logging.root.handlers = [handler] 52 | 53 | 54 | def set_file_logger(options, filename): 55 | """ 56 | 设置使用文件记录日志 57 | 需在DaemonContext中调用此函数,否则DaemonContext会关闭日志文件导致启动失败 58 | """ 59 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s' 60 | date_fmt = '%y%m%d %H:%M:%S' 61 | formatter = logging.Formatter(fmt, date_fmt) 62 | handler = TimedRotatingFileHandler( 63 | filename=os.path.join(options.log, filename), 64 | backupCount=options.backup, 65 | when="MIDNIGHT" 66 | ) 67 | handler.setFormatter(formatter) 68 | logging.root.handlers = [handler] 69 | 70 | 71 | def handler_common_stop(options, pid_name): 72 | """停止指定进程""" 73 | pidfile = os.path.join(options.log, pid_name) 74 | if not os.path.exists(pidfile): 75 | return "后台进程未启动" 76 | with open(pidfile) as f: 77 | pid = int(f.read()) 78 | if pid: 79 | os.kill(pid, signal.SIGINT) 80 | print(f"已发出信号,等待进程退出,pid={pid}") 81 | # 等待进程退出 82 | for _ in range(32): 83 | if not psutil.pid_exists(pid): 84 | return "OK" 85 | time.sleep(1) 86 | else: 87 | return f"ERR: 进程超时未退出,pid={pid}" 88 | else: 89 | return "OK" 90 | 91 | 92 | def handler_common_tail(options, filename): 93 | """查看指定进程的日志""" 94 | logfile = os.path.join(options.log, filename) 95 | if not os.path.exists(logfile): 96 | return "没有日志" 97 | for line in tail(logfile): 98 | print(line, end='') 99 | 100 | 101 | def parse_args(args): 102 | """ 103 | 从字符串中解析出多个参数 104 | 105 | >>> parse_args(" a,b,c, ") 106 | ['a', 'b', 'c'] 107 | """ 108 | if not args: 109 | return [] 110 | args = args.replace(",", ",") 111 | return [a.strip() for a in args.split(",") if a.strip()] 112 | 113 | 114 | def handler_proxy_run(options): 115 | """启动代理池节点""" 116 | if options.damon: 117 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, 'proxy.pid')) 118 | if pidfile.is_locked(): 119 | pid = pidfile.read_pid() 120 | if psutil.pid_exists(pid): 121 | return f"已有实例正在运行,pid={pid}" 122 | else: 123 | pidfile.break_lock() 124 | print("OK") 125 | with daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr): 126 | set_file_logger(options, 
"proxy") 127 | return proxy.start(options.redis) 128 | else: 129 | return proxy.start(options.redis) 130 | 131 | 132 | def handler_proxy_add(options): 133 | """添加代理""" 134 | db = redis.StrictRedis.from_url(options.redis) 135 | # 扫描所有驱动 136 | driver_name_to_title = {} 137 | for driver_name, var in vars(drivers).items(): 138 | try: 139 | if issubclass(var, drivers.ProxyDriver) \ 140 | and var is not drivers.ProxyDriver \ 141 | and hasattr(var, 'title'): 142 | driver_name_to_title[driver_name] = getattr(var, 'title') 143 | except TypeError: 144 | pass 145 | if not driver_name_to_title: 146 | return "ERR: 无可用驱动" 147 | drivers_names = list(driver_name_to_title.items()) 148 | # 询问用户,选择驱动 149 | print("请选择代理驱动 (填写序号或英文名称)") 150 | print('\n'.join([f"{i}. {k}, {v}" for i, (k, v) in enumerate(drivers_names)])) 151 | s = input('➜ ') 152 | driver_name = driver_name_to_title.get(s) and s 153 | if driver_name is None: 154 | try: 155 | driver_name = drivers_names[int(s)][0] 156 | except (ValueError, KeyError, IndexError): 157 | return "ERR: 序号或名称错误" 158 | print("当前驱动为: ", driver_name) 159 | driver_cls = getattr(drivers, driver_name) 160 | # 询问配置 161 | proxy_name = template_input([{ 162 | "name": "name", 163 | "title": "请为当前配置设置独一无二的名称" 164 | }])['name'] 165 | proxy_params = driver_cls.get_params() 166 | # 检查配置名是否重复 167 | if db.hexists("proxy:configs", proxy_name): 168 | s = input(f"配置'{proxy_name}'已存在,是否覆盖 (Y/N) ") 169 | if s.upper() != 'Y': 170 | return 'Bye~' 171 | # 写入配置 172 | proxy_params['version'] = int(time.time()) 173 | proxy_params['driver'] = driver_cls.__name__ 174 | db.hset("proxy:configs", proxy_name, json.dumps(proxy_params)) 175 | return 'OK' 176 | 177 | 178 | def handler_proxy_remove(options): 179 | """删除代理""" 180 | db = redis.StrictRedis.from_url(options.redis) 181 | 182 | if options.name == 'all': 183 | count = db.delete("proxy:configs", *db.keys("proxy:addresses:*")) 184 | else: 185 | count = db.hdel("proxy:configs", options.name) 186 | count += db.delete(f"proxy:addresses:{options.name}") 187 | if count: 188 | return 'OK' 189 | else: 190 | return '没有代理' 191 | 192 | 193 | def handler_proxy_list(options): 194 | """列出所有代理""" 195 | db = redis.StrictRedis.from_url(options.redis) 196 | 197 | configs = db.hgetall("proxy:configs") 198 | if not configs: 199 | return "没有代理" 200 | configs = {k.decode(): json.loads(v) for k, v in configs.items()} 201 | data = [(k, v['driver']) for k, v in configs.items()] 202 | headers = ['配置名', '驱动'] 203 | return tabulate(data, headers, 'presto', showindex='always') 204 | 205 | 206 | def handler_run(options): 207 | """运行爬虫""" 208 | db = redis.StrictRedis.from_url(options.redis) 209 | spider_configs = load_spider_configs(options.spider) 210 | 211 | proxies = parse_args(options.proxy) 212 | if proxies: 213 | for proxy_name in proxies: 214 | if not db.hexists("proxy:configs", proxy_name): 215 | return f"ERR: 未找到代理'{proxy_name}'" 216 | logger.info("使用代理运行", proxies) 217 | 218 | if not os.path.exists(os.path.join(options.spider, '__init__.py')): 219 | return "ERR: 未找到爬虫入口:'__init__.py'" 220 | 221 | if options.clear: 222 | logger.info("清空队列与代理数据") 223 | Spider(db, options.name).clear_proxy() 224 | Spider(db, options.name).clear_queue() 225 | 226 | if options.damon: 227 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, f'{options.name}.pid')) 228 | if pidfile.is_locked(): 229 | pid = pidfile.read_pid() 230 | if psutil.pid_exists(pid): 231 | return f"已有实例正在运行,pid={pid}" 232 | else: 233 | pidfile.break_lock() 234 | logger.info("转入后台运行") 235 | with 
daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr): 236 | set_file_logger(options, options.name) 237 | return worker.start( 238 | options.spider, options.redis, spider_configs, 239 | proxies, options.processes, options.threads 240 | ) 241 | else: 242 | return worker.start( 243 | options.spider, options.redis, spider_configs, 244 | proxies, options.processes, options.threads 245 | ) 246 | 247 | 248 | def handler_remove(options): 249 | """清除数据""" 250 | db = redis.StrictRedis(options.redis) 251 | spider = Spider(db, options.name) 252 | if options.target == 'queue': 253 | count = spider.clear_queue() 254 | return f"已清除{count}条队列数据" 255 | elif options.target == 'proxy': 256 | count = spider.clear_proxy() 257 | if count: 258 | return "已清除代理数据" 259 | else: 260 | return "没有代理数据" 261 | else: 262 | return f"无法清理:{options.target}" 263 | 264 | 265 | def handler_top(options): 266 | """查看统计""" 267 | db = redis.StrictRedis.from_url(options.redis) 268 | tracking = tasks.Tracking(options.name, db) 269 | lasts = {field: tracking.get(field) for field in sorted(tracking.fields)} 270 | try: 271 | while True: 272 | print(f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', end=' ') 273 | count = db.llen(f"proxy:addresses:{options.name}") 274 | print(f'proxy:{count}', end='; ') 275 | fields = sorted(tracking.fields) 276 | for field in fields: 277 | last = lasts.get(field, None) 278 | current = tracking.get(field) 279 | lasts[field] = current 280 | if last is None: 281 | print(f"{field}:None", end='; ') 282 | else: 283 | print(f"{field}:{round((current-last)/options.interval, 1)}", end='; ') 284 | print(end='\n') 285 | time.sleep(options.interval) 286 | except KeyboardInterrupt: 287 | return 288 | 289 | 290 | def handler_tag_list(options): 291 | """查看所有异常标签""" 292 | db = redis.StrictRedis.from_url(options.redis) 293 | if not Spider(db, options.name).exists(): 294 | return "爬虫不存在" 295 | queue = Queue(db, options.name) 296 | if options.tag: 297 | data = [(d,) for d in queue.get_errors(options.tag, 0)] 298 | return tabulate(data, ['URL'], 'presto', showindex='always') 299 | else: 300 | tags = queue.tags 301 | if not tags: 302 | return "没有标签" 303 | else: 304 | data = sorted(tags.items(), key=lambda t: t[1], reverse=True) 305 | headers = ['标签', '数量'] 306 | return tabulate(data, headers, 'presto', showindex='always') 307 | 308 | 309 | def handler_tag_remove(options): 310 | """移除异常标签""" 311 | db = redis.StrictRedis.from_url(options.redis) 312 | if not Spider(db, options.name).exists(): 313 | return "爬虫不存在" 314 | queue = Queue(db, options.name) 315 | if options.tags == 'all': 316 | tags = queue.tags 317 | else: 318 | tags = parse_args(options.tags) 319 | if not tags: 320 | return '没有标签' 321 | for tag in tags: 322 | if queue.remove_tag(tag): 323 | print(f"已删除标签'{tag}'") 324 | else: 325 | print(f"未找到标签'{tag}'") 326 | return "OK" 327 | 328 | 329 | def handler_tag_rollback(options): 330 | """回滚异常标签下的所有任务""" 331 | db = redis.StrictRedis.from_url(options.redis) 332 | if not Spider(db, options.name).exists(): 333 | return "爬虫不存在" 334 | queue = Queue(db, options.name) 335 | if options.tags == 'all': 336 | tags = queue.tags 337 | else: 338 | tags = parse_args(options.tags) 339 | if tags: 340 | for tag in tags: 341 | count = queue.rollback_tag(tag, 0) 342 | return f"回滚'{tag}', 数量:{count}, 队列优先级:0" 343 | else: 344 | return "未指定标签" 345 | 346 | 347 | def main(): 348 | # parents 349 | log = argparse.ArgumentParser(add_help=False) 350 | log.add_argument('-l', '--level', default='info', help='日志级别') 351 | 
log.add_argument('--log', help='存放日志文件的目录') 352 | log.add_argument('--backup', type=int, default=3, help='日志文件保留数量') 353 | spider = argparse.ArgumentParser(add_help=False) 354 | spider.add_argument('-s', '--spider', default='./', help='指定爬虫目录') 355 | db = argparse.ArgumentParser(add_help=False) 356 | db.add_argument('-r', '--redis', default='redis://127.0.0.1:6379/0', help='指定redis地址') 357 | 358 | # pyloom 359 | node = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 360 | node.set_defaults(module=None) 361 | node_modules = node.add_subparsers() 362 | 363 | # pyloom run 364 | node_run = node_modules.add_parser('run', help='运行爬虫', parents=[log, spider, db]) 365 | node_run.set_defaults(module='run') 366 | node_run.add_argument('-C', '--clear', action="store_true", help='清空爬虫数据后运行') 367 | node_run.add_argument('--proxy', help='使用指定代理运行,逗号分隔多个代理') 368 | node_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行') 369 | node_run.add_argument('-p', '--processes', default=2, type=int, help='子进程数量') 370 | node_run.add_argument('-t', '--threads', default=40, type=int, help='每个子进程的线程数量') 371 | # pyloom stop 372 | node_stop = node_modules.add_parser('stop', help='停止后台运行的爬虫', parents=[spider]) 373 | node_stop.set_defaults(module='stop') 374 | # pyloom tail 375 | node_tail = node_modules.add_parser('tail', help='查看日志文件', parents=[log, spider]) 376 | node_tail.set_defaults(module='tail') 377 | # pyloom top 378 | node_top = node_modules.add_parser('top', help='查看统计', parents=[spider, db]) 379 | node_top.set_defaults(module='top') 380 | node_top.add_argument('-i', '--interval', default=10, type=int, help='抽样间隔') 381 | # pyloom remove 382 | node_remove = node_modules.add_parser('remove', help='清除爬虫数据') 383 | node_remove.set_defaults(module='remove') 384 | node_remove.set_defaults(target=None) 385 | node_remove_targets = node_remove.add_subparsers() 386 | # pyloom remove queue 387 | node_remove_queue = node_remove_targets.add_parser('queue', help='清除队列数据', parents=[spider, db]) 388 | node_remove_queue.set_defaults(target='queue') 389 | # pyloom remove proxy 390 | node_remove_proxy = node_remove_targets.add_parser('proxy', help='清空代理池', parents=[spider, db]) 391 | node_remove_proxy.set_defaults(target='proxy') 392 | # pyloom tag 393 | node_tag = node_modules.add_parser('tag', help='标签管理') 394 | node_tag.set_defaults(module='tag') 395 | node_tag.set_defaults(command=None) 396 | node_tag_commands = node_tag.add_subparsers() 397 | # pyloom tag list 398 | node_tag_list = node_tag_commands.add_parser('list', help='查看标签', parents=[spider, db]) 399 | node_tag_list.set_defaults(command='list') 400 | node_tag_list.add_argument('tag', nargs='?', help='列出指定标签的内容,留空显示标签列表') 401 | # pyloom tag remove 402 | node_tag_remove = node_tag_commands.add_parser('remove', help='清除标签', parents=[spider, db]) 403 | node_tag_remove.set_defaults(command='remove') 404 | node_tag_remove.add_argument('tags', help='被清除的标签,逗号分隔多个标签') 405 | # pyloom rollback :tag 406 | node_tag_rollback = node_tag_commands.add_parser('rollback', help='回滚标签', parents=[spider, db]) 407 | node_tag_rollback.set_defaults(command='rollback') 408 | node_tag_rollback.add_argument('tags', help='被回滚的标签,逗号分隔多个标签') 409 | 410 | # pyloom proxy 411 | node_proxy = node_modules.add_parser('proxy', help='代理节点') 412 | node_proxy.set_defaults(module='proxy') 413 | node_proxy.set_defaults(command=None) 414 | node_proxy_commands = node_proxy.add_subparsers() 415 | # pyloom proxy run 416 | node_proxy_run = 
node_proxy_commands.add_parser('run', help='启动代理节点', parents=[log, db]) 417 | node_proxy_run.set_defaults(command='run') 418 | node_proxy_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行') 419 | # pyloom proxy stop 420 | node_proxy_stop = node_proxy_commands.add_parser('stop', help='停止节点', parents=[log]) 421 | node_proxy_stop.set_defaults(command='stop') 422 | # pyloom proxy tail 423 | node_proxy_tail = node_proxy_commands.add_parser('tail', help='查看日志', parents=[log]) 424 | node_proxy_tail.set_defaults(command='tail') 425 | # pyloom proxy add 426 | node_proxy_add = node_proxy_commands.add_parser('add', help='添加代理', parents=[db]) 427 | node_proxy_add.set_defaults(command='add') 428 | # pyloom proxy remove 429 | node_proxy_remove = node_proxy_commands.add_parser('remove', help='删除指定代理', parents=[db]) 430 | node_proxy_remove.set_defaults(command='remove') 431 | node_proxy_remove.add_argument('name', help='欲删除的代理名称,all表示所有代理') 432 | # pyloom proxy list 433 | node_proxy_list = node_proxy_commands.add_parser('list', help='列出所有配置', parents=[db]) 434 | node_proxy_list.set_defaults(command='list') 435 | 436 | # 路由至对应模块 437 | options = node.parse_args() 438 | try: 439 | set_defaults(options) 440 | set_console_logger() 441 | if options.module == 'proxy': 442 | if options.command == 'run': 443 | return handler_proxy_run(options) 444 | elif options.command == 'stop': 445 | return handler_common_stop(options, 'proxy.pid') 446 | elif options.command == 'tail': 447 | return handler_common_tail(options, 'proxy') 448 | elif options.command == 'add': 449 | return handler_proxy_add(options) 450 | elif options.command == 'remove': 451 | return handler_proxy_remove(options) 452 | elif options.command == 'list': 453 | return handler_proxy_list(options) 454 | else: 455 | return node_proxy.print_help() 456 | elif options.module == 'run': 457 | return handler_run(options) 458 | elif options.module == 'stop': 459 | return handler_common_stop(options, f'{options.name}.pid') 460 | elif options.module == 'remove': 461 | return handler_remove(options) 462 | elif options.module == 'top': 463 | return handler_top(options) 464 | elif options.module == 'tail': 465 | return handler_common_tail(options, options.name) 466 | elif options.module == 'tag': 467 | if options.command == 'list': 468 | return handler_tag_list(options) 469 | elif options.command == 'remove': 470 | return handler_tag_remove(options) 471 | elif options.command == 'rollback': 472 | return handler_tag_rollback(options) 473 | else: 474 | return node_tag.print_help() 475 | else: 476 | return node.print_help() 477 | except ConfigFileNotFoundError as e: 478 | return f'ERR: {str(e)}' 479 | -------------------------------------------------------------------------------- /pyloom/errors.py: -------------------------------------------------------------------------------- 1 | class TaskError(Exception): 2 | """ 3 | Task生命周期中出现的异常 4 | 若某Task抛出此异常,当前URL将被加入异常队列(error) 5 | """ 6 | 7 | def __init__(self, tag): 8 | self.err = f"TaskError('{tag}')" 9 | 10 | def __str__(self): 11 | return self.err 12 | 13 | 14 | class TaskFinish(Exception): 15 | """提前结束生命周期,并将当前URL加入布隆过滤器""" 16 | 17 | 18 | class TaskBreak(Exception): 19 | """提前结束生命周期,并将当前URL归还到任务队列""" 20 | 21 | def __init__(self, priority=0): 22 | """ 23 | Args: 24 | priority: 指定队列优先级 25 | """ 26 | self.priority = priority 27 | 28 | 29 | class RetryExceeded(TaskError): 30 | """重试次数超限""" 31 | 32 | def __init__(self): 33 | self.err = "RetryExceeded" 34 | 35 | 36 | class RequestError(Exception): 37 | 
"""请求异常""" 38 | 39 | 40 | class Timeout(RequestError): 41 | """请求超时""" 42 | 43 | 44 | class ProxyError(Exception): 45 | """代理异常""" 46 | 47 | 48 | class RetryError(Exception): 49 | """重试错误,需搭配tasks.retry装饰器使用""" 50 | 51 | 52 | class JSONDecodeError(ValueError): 53 | """使用Response().json时出现解码错误""" 54 | 55 | 56 | class DebuggerError(Exception): 57 | pass 58 | 59 | 60 | class SchedulerError(Exception): 61 | pass 62 | 63 | 64 | class ConfigError(Exception): 65 | def __init__(self, name, err=None): 66 | self.name = name 67 | self.err = err 68 | 69 | def __str__(self): 70 | s = f"配置'{self.name}'有误" 71 | if self.err is not None: 72 | s += f", {self.err}" 73 | return s 74 | 75 | 76 | class ConfigFileNotFoundError(ConfigError, FileNotFoundError): 77 | def __init__(self, file): 78 | self.file = file 79 | 80 | def __str__(self): 81 | return f"未找到配置文件:'{self.file}'" 82 | 83 | 84 | class ConfigNotNone(ConfigError, ValueError): 85 | def __init__(self, name): 86 | self.name = name 87 | 88 | def __str__(self): 89 | return f"缺少配置项:'{self.name}'" 90 | 91 | 92 | class BucketError(Exception): 93 | pass 94 | -------------------------------------------------------------------------------- /pyloom/lua/bloom_cas.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2017 Erik Dubbelboer 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ]] 24 | 25 | -- 在原作者Erik Dubbelboer的成果上做了简单修改 26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter 27 | 28 | local bloom_cas = function(name, entries, precision, value) 29 | local hash = redis.sha1hex(value) 30 | local prefix = "queue:" .. name .. ":filter:bloom" 31 | local countkey = prefix .. 
':count' 32 | local count = redis.call('GET', countkey) 33 | if not count then 34 | count = 1 35 | else 36 | count = count + 1 37 | end 38 | 39 | local factor = math.ceil((entries + count) / entries) 40 | -- 0.69314718055995 = ln(2) 41 | local index = math.ceil(math.log(factor) / 0.69314718055995) 42 | local scale = math.pow(2, index - 1) * entries 43 | 44 | -- This uses a variation on: 45 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 46 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 47 | local h = {} 48 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 49 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 50 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 51 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 52 | 53 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 54 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 55 | -- 0.4804530139182 = ln(2)^2 56 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 57 | 58 | -- 0.69314718055995 = ln(2) 59 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 60 | local b = {} 61 | 62 | for i = 1, maxk do 63 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 64 | end 65 | 66 | -- Only do this if we have data already. 67 | if index > 1 then 68 | -- The last fiter will be handled below. 69 | for n = 1, index - 1 do 70 | local key = prefix .. ':' .. n 71 | local scale = math.pow(2, n - 1) * entries 72 | 73 | -- 0.4804530139182 = ln(2)^2 74 | local bits = math.floor((scale * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 75 | 76 | -- 0.69314718055995 = ln(2) 77 | local k = math.floor(0.69314718055995 * bits / scale) 78 | 79 | local found = true 80 | for i = 1, k do 81 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 82 | found = false 83 | break 84 | end 85 | end 86 | 87 | if found then 88 | return 1 89 | end 90 | end 91 | end 92 | 93 | -- For the last filter we do a SETBIT where we check the result value. 94 | local key = prefix .. ':' .. index 95 | 96 | local found = 1 97 | for i = 1, maxk do 98 | if redis.call('SETBIT', key, b[i] % maxbits, 1) == 0 then 99 | found = 0 100 | end 101 | end 102 | 103 | if found == 0 then 104 | -- INCR is a little bit faster than SET. 105 | redis.call('INCR', countkey) 106 | end 107 | 108 | return found 109 | end 110 | 111 | -- 从爬虫配置读取布隆参数 112 | local name = KEYS[1] 113 | local key_spider = "spider:" .. 
name 114 | local precision = redis.call('HGET', key_spider, 'precision') 115 | if not precision then 116 | return { err = "爬虫未配置'precision'" } 117 | end 118 | 119 | return bloom_cas(name, 1000000, precision, ARGV[1]) 120 | -------------------------------------------------------------------------------- /pyloom/lua/bloom_check.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2017 Erik Dubbelboer 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ]] 24 | 25 | -- 在原作者Erik Dubbelboer的成果上做了简单修改 26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter 27 | 28 | local bloom_check = function(name, entries, precision, value) 29 | local prefix = "queue:" .. name .. ":filter:bloom" 30 | local count = redis.call('GET', prefix .. ':count') 31 | if not count then 32 | return 0 33 | end 34 | 35 | local factor = math.ceil((entries + count) / entries) 36 | -- 0.69314718055995 = ln(2) 37 | local index = math.ceil(math.log(factor) / 0.69314718055995) 38 | local scale = math.pow(2, index - 1) * entries 39 | local hash = redis.sha1hex(value) 40 | 41 | -- This uses a variation on: 42 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 43 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 44 | local h = {} 45 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 46 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 47 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 48 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 49 | 50 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 51 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 52 | -- 0.4804530139182 = ln(2)^2 53 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 54 | -- 0.69314718055995 = ln(2) 55 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 56 | local b = {} 57 | 58 | for i = 1, maxk do 59 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 60 | end 61 | 62 | for n = 1, index do 63 | local key = prefix .. ':' .. 
n 64 | local found = true 65 | local scalen = math.pow(2, n - 1) * entries 66 | 67 | -- 0.4804530139182 = ln(2)^2 68 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 69 | 70 | -- 0.69314718055995 = ln(2) 71 | local k = math.floor(0.69314718055995 * bits / scalen) 72 | 73 | for i = 1, k do 74 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 75 | found = false 76 | break 77 | end 78 | end 79 | 80 | if found then 81 | return 1 82 | end 83 | end 84 | 85 | return 0 86 | end 87 | 88 | -- 从爬虫配置中读取布隆参数 89 | local name = KEYS[1] 90 | local key_spider = "spider:" .. name 91 | local precision = redis.call('HGET', key_spider, 'precision') 92 | if not precision then 93 | return { err = "爬虫未配置'precision'" } 94 | end 95 | 96 | return bloom_check(name, 1000000, precision, ARGV[1]) 97 | -------------------------------------------------------------------------------- /pyloom/lua/url_add.lua: -------------------------------------------------------------------------------- 1 | -- 将一组URLs排重后添加至队列 2 | -- Keys: name priority 3 | -- Argv: url [url ...] 4 | -- Return: count,成功数量(不重复数量) 5 | 6 | local bloom_check = function(name, entries, precision, value) 7 | local prefix = "queue:" .. name .. ":filter:bloom" 8 | local count = redis.call('GET', prefix .. ':count') 9 | if not count then 10 | return 0 11 | end 12 | 13 | local factor = math.ceil((entries + count) / entries) 14 | -- 0.69314718055995 = ln(2) 15 | local index = math.ceil(math.log(factor) / 0.69314718055995) 16 | local scale = math.pow(2, index - 1) * entries 17 | local hash = redis.sha1hex(value) 18 | 19 | -- This uses a variation on: 20 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 21 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 22 | local h = {} 23 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 24 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 25 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 26 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 27 | 28 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 29 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 30 | -- 0.4804530139182 = ln(2)^2 31 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 32 | -- 0.69314718055995 = ln(2) 33 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 34 | local b = {} 35 | 36 | for i = 1, maxk do 37 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 38 | end 39 | 40 | for n = 1, index do 41 | local key = prefix .. ':' .. n 42 | local found = true 43 | local scalen = math.pow(2, n - 1) * entries 44 | 45 | -- 0.4804530139182 = ln(2)^2 46 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 47 | 48 | -- 0.69314718055995 = ln(2) 49 | local k = math.floor(0.69314718055995 * bits / scalen) 50 | 51 | for i = 1, k do 52 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 53 | found = false 54 | break 55 | end 56 | end 57 | 58 | if found then 59 | return 1 60 | end 61 | end 62 | 63 | return 0 64 | end 65 | 66 | local name = KEYS[1] 67 | local priority = KEYS[2] 68 | local key_spider = "spider:" .. name 69 | local key_waiting = "queue:" .. name .. ":waiting:" .. priority 70 | local key_filter_queue = "queue:" .. name .. 
":filter:queue" 71 | local filter = {} -- 对ARGV排重 72 | local urls = {} -- 将被批量添加到waiting队列的URL 73 | 74 | -- 从爬虫配置中读取布隆参数 75 | local precision = redis.call('HGET', key_spider, 'precision') 76 | if not precision then 77 | return { err = "爬虫未配置'precision'" } 78 | end 79 | 80 | -- 排重 81 | for i = 1, #ARGV do 82 | local url = ARGV[i] 83 | local exists = filter[url] or 84 | bloom_check(name, 1000000, precision, url) == 1 or 85 | redis.call('SISMEMBER', key_filter_queue, url) == 1 86 | if not exists then 87 | filter[url] = true 88 | table.insert(urls, url) 89 | end 90 | end 91 | 92 | -- 加入队列 93 | if #urls == 0 then 94 | return 0 95 | else 96 | redis.call('LPUSH', key_waiting, unpack(urls)) 97 | redis.call('SADD', key_filter_queue, unpack(urls)) 98 | return #urls 99 | end 100 | -------------------------------------------------------------------------------- /pyloom/lua/url_pop.lua: -------------------------------------------------------------------------------- 1 | -- 请求分配任务 2 | -- Keys: now 3 | -- Argv: name [name ...] 4 | -- Return: (url, name, address) 5 | local now = tonumber(KEYS[1]) 6 | for i = 1, #ARGV do 7 | local name = ARGV[i] -- 爬虫名 8 | local key_spider = "spider:" .. name 9 | local status = tonumber(redis.call("HGET", key_spider, "status")) 10 | redis.call("HSET", key_spider, "last_heartbeat_time", now) 11 | -- 条件: 爬虫状态至少为就绪态 12 | if status >= 10 then 13 | local interval = cjson.decode(redis.call("HGET", key_spider, "interval")) 14 | local last_pop_time = cjson.decode(redis.call("HGET", key_spider, "last_pop_time")) 15 | -- 条件: 爬虫已到可用时间 16 | if now >= last_pop_time + interval then 17 | local proxies = cjson.decode(redis.call("HGET", key_spider, "proxies")) 18 | local address = false 19 | -- 爬虫被设置了代理,把代理池弹空也要弹出一个可用代理 20 | if #proxies ~= 0 then 21 | local recycle = {} 22 | while not address do 23 | address = redis.call("RPOP", "proxy:addresses:" .. name) 24 | -- 代理池空了,不再继续弹出 25 | if not address then 26 | break 27 | end 28 | local t1 = string.find(address, ":", 1) 29 | local t2 = string.find(address, ":", t1 + 1) 30 | local valid_at = tonumber(string.sub(address, 1, t1 - 1)) 31 | local expire_at = tonumber(string.sub(address, t1 + 1, t2 - 1)) 32 | 33 | if valid_at > now then 34 | -- 代理未到可用时间,放回去 -> 重新弹出 35 | table.insert(recycle, address) 36 | address = false 37 | else 38 | -- 代理已到可用时间,并未过期 -> 已拿到代理! 39 | if expire_at > now then 40 | break 41 | end 42 | -- 代理已到可用时间,已过期 -> 重新弹出 43 | -- 过期代理不归还 44 | end 45 | end 46 | if #recycle ~= 0 then 47 | redis.call("LPUSH", "proxy:addresses:" .. name, unpack(recycle)) 48 | end 49 | end 50 | -- 条件: 爬虫未设置代理或代理池中可用代理不全为空 51 | if #proxies == 0 or address then 52 | local key_processing = "queue:" .. name .. ":processing" 53 | for priority = 0, 4 do 54 | -- 条件: 爬虫名下所有队列不全为空 55 | local key_waiting = "queue:" .. name .. ":waiting:" .. 
priority 56 | local url = redis.call("RPOP", key_waiting) 57 | -- 已满足所有条件,发布任务 58 | if url then 59 | -- 加入processing 60 | redis.call("HSET", key_processing, url, now) 61 | -- 更新last_pop_time 62 | redis.call("HSET", key_spider, "last_pop_time", now) 63 | return { url, name, address } 64 | end 65 | end 66 | -- 队列全部为空时,设置爬虫状态为已完成 67 | local processing_len = redis.call("HLEN", key_processing) 68 | if processing_len == 0 then 69 | redis.call("HSET", key_spider, "status", 0) 70 | end 71 | end 72 | end 73 | end 74 | end 75 | 76 | return { false, false, false } 77 | -------------------------------------------------------------------------------- /pyloom/proxy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import redis 3 | import traceback 4 | from .utils import * 5 | from .errors import * 6 | from . import drivers 7 | from .scheduler import Spider 8 | from threading import Thread, Lock 9 | 10 | logger = logging.getLogger("proxy") 11 | 12 | 13 | def proxy_handler(redis_conf, names, name, router, router_lock, driver, **params): 14 | """ 15 | 从驱动获取代理,推送至指定爬虫的代理池中 16 | 每只爬虫有一个代理池,键名为'proxy:proxies:',list型 17 | [address1, address2, address3...] 18 | 其中address为str型,结构为:'valid_at:expire_at:address' 19 | valid_at: 代理生效时间 20 | expire_at: 代理失效时间 21 | address: 代理地址 22 | address在valid_at < now < expire_at可用,并在now > expire_at时被删除 23 | Args: 24 | redis_conf: Redis配置 25 | names: 所有存活代理 26 | name: 当前代理名,通过name in names判断当前代理是否存活 27 | router: 路由表,获取代理后,根据路由推送至代理池 28 | router_lock: router的锁 29 | driver: 驱动类 30 | params: 驱动参数 31 | """ 32 | db = redis.StrictRedis.from_url(redis_conf) 33 | gen = driver(**params).gen_addresses() 34 | for is_ok, result in gen: 35 | with router_lock: 36 | targets = router.get(name, []) 37 | logger.debug("代理正在运行", name, targets) 38 | if not targets: 39 | logger.info("代理退出,router中没有记录", name, driver) 40 | break 41 | if name not in names: 42 | logger.info("代理退出,names中没有记录", name, driver) 43 | break 44 | if is_ok: 45 | for target in targets: 46 | length = db.lpush(f"proxy:addresses:{target}", *result) 47 | logger.info(f"添加代理, 代理:{name},目标:{target}, 新增数量:{len(result)}, 当前数量:{length}\n", result) 48 | else: 49 | time.sleep(1) 50 | else: 51 | logger.warning("代理出现异常", name, result) 52 | 53 | 54 | def get_driver(driver_name): 55 | """获取并检查驱动是否正确""" 56 | if not hasattr(drivers, driver_name): 57 | raise ProxyError("未找到驱动", driver_name) 58 | driver = getattr(drivers, driver_name) 59 | if not issubclass(driver, drivers.ProxyDriver): 60 | raise ProxyError("驱动应继承自ProxyDriver", driver_name) 61 | if not hasattr(driver, 'title'): 62 | raise ProxyError("驱动缺少属性", f"{driver_name}.title") 63 | return driver 64 | 65 | 66 | def start(redis_conf): 67 | """ 68 | 根据代理配置,维护代理线程池 69 | 代理配置为一个redis dict键,键名为proxy:configs,结构为: 70 | { 71 | proxy_name: { 72 | version: str, // 版本号,版本号变化时,代理线程将会重启 73 | driver: str, // 驱动名,对应proxy.py中的类 74 | **params // 驱动参数,将被传递给驱动 75 | } 76 | } 77 | """ 78 | logger.info("代理池已启动") 79 | db = redis.StrictRedis.from_url(redis_conf) 80 | threads = {} # 配置表,{proxy_name: {'version': int, 'thread': Thread}} 81 | router = {} # 路由表,{proxy_name: set([spider_name])} 82 | router_lock = Lock() 83 | for i in itertools.count(): 84 | try: 85 | time.sleep(3 if i else 0) 86 | # 更新路由表,指示代理线程拿到代理后要推给哪些爬虫 87 | _router = {} 88 | for spider_name in Spider.names(db): 89 | spider = Spider(db, spider_name) 90 | if spider.get_field("status") < 10: 91 | logger.debug("忽略未就绪爬虫", spider_name) 92 | continue 93 | last_heartbeat_time = 
spider.get_field("last_heartbeat_time") 94 | if time.time() - last_heartbeat_time > 300: 95 | logger.debug("忽略长久未运行的爬虫", spider_name) 96 | continue 97 | proxies = Spider(db, spider_name).get_field("proxies") 98 | if not proxies: 99 | logger.debug("忽略未配置代理的爬虫", spider_name) 100 | continue 101 | for proxy_name in proxies: 102 | _router.setdefault(proxy_name, set()).add(spider_name) 103 | with router_lock: 104 | router.clear() 105 | router.update(_router) 106 | 107 | # 标记失效线程 108 | configs = { 109 | k.decode(): json.loads(v) for k, v in db.hgetall("proxy:configs").items() 110 | } 111 | logger.debug("代理配置", configs) 112 | marked_threads = {} # 被标记退出的线程,结构同threads 113 | for proxy_name, fields in threads.items(): 114 | if proxy_name not in configs: 115 | logger.info("标记配置被删的代理", proxy_name) 116 | marked_threads[proxy_name] = fields 117 | continue 118 | if fields['version'] != configs[proxy_name]['version']: 119 | logger.info("标记配置更新的代理", proxy_name) 120 | marked_threads[proxy_name] = fields 121 | continue 122 | if proxy_name not in router: 123 | logger.info("标记已无爬虫的代理", proxy_name) 124 | marked_threads[proxy_name] = fields 125 | continue 126 | if not fields['thread'].is_alive(): 127 | logger.info("标记异常退出的代理", proxy_name) 128 | marked_threads[proxy_name] = fields 129 | continue 130 | 131 | # 销毁被标记的线程 132 | # 线程看见自己没在threads中时会终止 133 | if marked_threads: 134 | for proxy_name in marked_threads.keys(): 135 | del threads[proxy_name] 136 | with router_lock: 137 | if proxy_name in router: 138 | del router[proxy_name] 139 | logger.info("等待被标记代理线程退出", list(marked_threads.keys())) 140 | for _ in range(300): 141 | alive = any([t['thread'].is_alive() for t in marked_threads.values()]) 142 | if not alive: 143 | break 144 | time.sleep(1) 145 | else: 146 | logger.error("被标记代理线程超时仍未退出") 147 | threads.update(marked_threads) 148 | time.sleep(3) 149 | continue 150 | logger.info("被标记代理线程已全部退出") 151 | # 启动新线程 152 | proxy_names_new = set(configs.keys()) - set(threads.keys()) 153 | if proxy_names_new: 154 | for proxy_name in proxy_names_new: 155 | targets = router.get(proxy_name, []) 156 | if not targets: 157 | logger.debug("代理名下没有爬虫,暂不启动", proxy_name) 158 | continue 159 | logger.info("启动代理线程", proxy_name) 160 | version = configs[proxy_name].pop('version') 161 | driver = get_driver(configs[proxy_name].pop('driver')) 162 | t = Thread( 163 | target=proxy_handler, 164 | args=( 165 | redis_conf, threads, proxy_name, router, router_lock, driver 166 | ), 167 | kwargs=(configs[proxy_name]), 168 | daemon=True 169 | ) 170 | threads[proxy_name] = { 171 | 'version': version, 172 | 'thread': t 173 | } 174 | t.start() 175 | except KeyboardInterrupt: 176 | logger.info("收到Ctrl+C", 'proxy') 177 | break 178 | except Exception as e: 179 | logger.fatal("未处理的异常", type(e), e, '\n', traceback.format_exc()) 180 | -------------------------------------------------------------------------------- /pyloom/scheduler.py: -------------------------------------------------------------------------------- 1 | """Scheduler SDK""" 2 | import json 3 | import time 4 | import copy 5 | import random 6 | import logging 7 | from .errors import * 8 | from redis import StrictRedis 9 | from . 
import utils, tasks, buckets 10 | 11 | key_spiders = "spiders" # set 12 | logger = logging.getLogger("scheduler") 13 | 14 | 15 | class Spider(object): 16 | prefix = "spider" 17 | _caches = {} 18 | _timeout = 10 19 | status = { 20 | 0: '已完成', 21 | 10: '就绪', 22 | 20: '等待代理', # 暂未实现 23 | 21: '等待时间', # 暂未实现 24 | -1: '异常关闭', 25 | -2: '主动关闭' 26 | } 27 | 28 | def __init__(self, db: StrictRedis, name): 29 | self.name = name # 爬虫名 30 | self._db = db 31 | self.key = f"{self.prefix}:{self.name}" # 主键 32 | self.fields = { # 爬虫所有字段及可缓存时间 33 | "interval": 300, 34 | "timeout": 300, 35 | "precision": 10000, 36 | "args": 300, 37 | "last_pop_time": 1, 38 | "status": 1, 39 | "version": 1, 40 | "proxies": 1, 41 | "last_heartbeat_time": 6 42 | } 43 | 44 | def exists(self): 45 | """爬虫是否存在""" 46 | return self._db.exists(self.key) 47 | 48 | def upsert(self, seeders, interval, timeout, precision, args, proxies, version): 49 | """ 50 | 新建爬虫或覆盖同名爬虫的配置(仅当版本号更大时) 51 | Args: 52 | seeders: 种子页面 53 | interval: 最小调度间隔(误差由pop频率决定) 54 | timeout: 任务超时时间 55 | precision: 布隆过滤器精度 56 | args: 自定义爬虫参数 57 | proxies: 使用代理运行 58 | version: 配置版本,等于目录的sha1值 59 | Returns: 60 | T/F: 是否更新了配置 61 | """ 62 | # 当前版本比数据库的还小就不更新了 63 | _version = self._get_field("version") 64 | if _version is not None and version <= _version: 65 | return False 66 | # 忽略更新precision字段 67 | _precision = self._get_field("precision") 68 | if _precision is not None: 69 | precision = _precision 70 | # 爬虫配置 71 | values = { 72 | "interval": interval, 73 | "timeout": timeout, 74 | "precision": precision, 75 | "args": args, 76 | "last_pop_time": 0, 77 | "status": 10, # 0:已完成,10:就绪,20:等待代理,21:等待时间,-1:异常关闭,-2:主动关闭 78 | "version": version, 79 | "proxies": proxies, # 代理配置 80 | "last_heartbeat_time": 0, # 最后一次尝试申请任务的时间 81 | } 82 | self._db.hmset(self.key, {k: json.dumps(v) for k, v in values.items()}) 83 | self._db.sadd(key_spiders, self.name) 84 | # 将种子URL入队 85 | queues = Queue(self._db, self.name) 86 | queues.add(seeders, 0) 87 | return True 88 | 89 | def _get_field(self, field): 90 | """从数据库中查询并返回爬虫的配置项""" 91 | if field not in self.fields: 92 | raise SchedulerError(f"没有此配置项'{field}'") 93 | 94 | res = self._db.hget(self.key, field) 95 | if res is None: 96 | return None 97 | else: 98 | return json.loads(res) 99 | 100 | def get_field(self, field): 101 | """依此从缓存、数据库中查询并返回爬虫的配置项""" 102 | if field not in self.fields: 103 | raise SchedulerError(f"没有此配置项'{field}'") 104 | 105 | timeout = self.fields[field] 106 | # 先尝试从缓存取值 107 | cache_key = f"{self.name}:{field}" 108 | var, start = Spider._caches.get(cache_key, (None, 0)) 109 | if start + timeout < time.time(): 110 | # 缓存过期或无缓存 111 | var = self._get_field(field) 112 | if timeout > 0: 113 | Spider._caches[cache_key] = (var, time.time()) 114 | return var 115 | 116 | def set_field(self, field, value): 117 | """覆写爬虫的配置项""" 118 | if field not in self.fields: 119 | raise SchedulerError(f"没有此配置项'{field}'") 120 | if field == 'precision': 121 | raise SchedulerError(f"配置项被锁定'{field}'") 122 | 123 | self._db.hset(self.key, field, json.dumps(value)) 124 | # 设置缓存 125 | cache_key = f"{self.name}:{field}" 126 | timeout = self.fields[field] 127 | if timeout > 0: 128 | Spider._caches[cache_key] = (value, time.time()) 129 | 130 | @classmethod 131 | def names(cls, db: StrictRedis): 132 | """返回所有爬虫名称的列表""" 133 | return [r.decode() for r in db.smembers(key_spiders)] 134 | 135 | def clear_queue(self): 136 | """清除该爬虫在队列中留存的数据""" 137 | keys = [] 138 | keys += self._db.keys(f"{Spider.prefix}:{self.name}") 139 | keys += 
self._db.keys(f"{Queue.prefix}:{self.name}:*") 140 | keys += self._db.keys(f"{buckets.ShareBucket.prefix}:{self.name}:*") 141 | keys += self._db.keys(f"{tasks.Tracking.prefix}:{self.name}:*") 142 | return self._db.delete(*keys) if keys else 0 143 | 144 | def clear_proxy(self): 145 | """清空该爬虫的代理池""" 146 | count = self._db.delete(f"proxy:addresses:{self.name}") 147 | count += self._db.srem(key_spiders, self.name) 148 | return count 149 | 150 | 151 | class Queue(object): 152 | prefix = "queue" 153 | 154 | def __init__(self, db: StrictRedis, name): 155 | self.name = name # 爬虫名 156 | self._db = db 157 | self._spider = Spider(db, name) 158 | # 等待队列(list),5个优先级分别用5个list实现,左进右出 159 | # [[url0, url1], [url0, url1], [url0, url1]] 160 | self.key_waiting = [f"{self.prefix}:{self.name}:waiting:{i}" for i in range(5)] 161 | # 进行队列(hash),field=url, value=timestamp 162 | self.key_processing = f"{self.prefix}:{self.name}:processing" 163 | # 异常标签(set) 164 | self.key_tags = f"{self.prefix}:{self.name}:tags" 165 | # 异常队列(list) 166 | self.prefix_error = f"{self.prefix}:{self.name}:errors" # :{tag} 167 | # 队列过滤器(set),过滤waiting、processing、errors中的URl 168 | self.key_filter_queue = f"{self.prefix}:{self.name}:filter:queue" 169 | # 结果过滤器(string or set),过滤已抓取完成的URL 170 | # 结果过滤器有两种实现:set、bloom,通过爬虫配置项'queue.filter'选择适合的实现 171 | self.key_filter_bloom_count = f"{self.prefix}:{self.name}:filter:bloom:count" 172 | 173 | def exists(self, url): 174 | """ 175 | URL是否存在 176 | Returns: 177 | 0: 不存在 178 | 1: 存在于bloom中 179 | 2: 存在于queue中 180 | """ 181 | # 在results中找 182 | sha = utils.RedisScripts.sha1('bloom_check') 183 | if self._db.evalsha(sha, 1, self.name, url): 184 | return 1 185 | # 在queue中找 186 | if self._db.sismember(self.key_filter_queue, url): 187 | return 2 188 | else: 189 | return 0 190 | 191 | def insert(self, url, priority): 192 | """忽略布隆检查,将URL插入至队列中""" 193 | self._db.lpush(self.key_waiting[priority], url) 194 | self._db.sadd(self.key_filter_queue, url) 195 | self._db.hdel(self.key_processing, url) 196 | 197 | def add(self, urls, priority): 198 | """ 199 | URL批量入队 200 | 当URL相同,但priority不同时,也视为重复 201 | Returns: 经排重后添加至队列的数量 202 | """ 203 | if not isinstance(priority, int): 204 | raise SchedulerError("priority应为int型") 205 | if priority < 0 or priority >= len(self.key_waiting): 206 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}") 207 | 208 | urls = list(set(urls)) 209 | sha = utils.RedisScripts.sha1('url_add') 210 | return self._db.evalsha(sha, 2, self.name, priority, *urls) 211 | 212 | @classmethod 213 | def pop(cls, db: StrictRedis, names): 214 | """ 215 | 从指定爬虫中弹出一条最合适的URL 216 | Returns: (url, name) 217 | 当所有队列为空时,url == name == None 218 | """ 219 | # 随机挑选爬虫 220 | names = copy.deepcopy(names) 221 | random.shuffle(names) 222 | 223 | sha = utils.RedisScripts.sha1('url_pop') 224 | url, name, address = db.evalsha(sha, 1, time.time(), *names) 225 | if url and name: 226 | return [url.decode(), name.decode(), address.decode() if address else None] 227 | else: 228 | return [None, None, None] 229 | 230 | @classmethod 231 | def purge(cls, db: StrictRedis): 232 | """ 233 | 清理processing中过期的URL,返回被清理数量 234 | 被清理的URL,将被打上"timeout"标签,移入error队列 235 | """ 236 | count = 0 237 | for name in Spider.names(db): 238 | key = f"{cls.prefix}:{name}:processing" 239 | queue = cls(db, name) 240 | timeout = Spider(db, name).get_field("timeout") 241 | # redis的scan是可能重复返回同一元素的 242 | for url, _start in db.hscan_iter(key): 243 | if time.time() > float(_start) + timeout: # 过期 244 | count += 
queue.report_error("timeout", url) 245 | return count 246 | 247 | def report_finish(self, url): 248 | """标记URL为已完成状态""" 249 | if not self._db.hdel(self.key_processing, url): 250 | return False 251 | self._db.srem(self.key_filter_queue, url) 252 | sha = utils.RedisScripts.sha1('bloom_cas') 253 | logger.debug("report_finish", self.name, url) 254 | return self._db.evalsha(sha, 1, self.name, url) 255 | 256 | def report_error(self, tag, url): 257 | """标记URL为异常状态""" 258 | if not self._db.hdel(self.key_processing, url): 259 | return False 260 | self._db.sadd(self.key_tags, tag) 261 | return self._db.lpush(f"{self.prefix_error}:{tag}", url) 262 | 263 | @property 264 | def tags(self): 265 | """获取标签列表""" 266 | return { 267 | r.decode(): self._db.llen(f"{self.prefix_error}:{r.decode()}") 268 | for r in self._db.smembers(self.key_tags) 269 | } 270 | 271 | def get_errors(self, tag, count=0): 272 | """获取指定标签下的所有异常URL""" 273 | key = f"{self.prefix_error}:{tag}" 274 | return [r.decode() for r in self._db.lrange(key, 0, count - 1)] 275 | 276 | def remove_tag(self, tag): 277 | key = f"{self.prefix_error}:{tag}" 278 | self._db.srem(self.key_tags, tag) 279 | return self._db.delete(key) 280 | 281 | def rollback_tag(self, tag, priority): 282 | """ 283 | 将指定标签下的异常URL移至waiting队列中 284 | 返回回滚的URL数量 285 | """ 286 | if not isinstance(priority, int): 287 | raise SchedulerError("priority应为int型") 288 | if priority < 0 or priority >= len(self.key_waiting): 289 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}") 290 | key_errors = f"{self.prefix_error}:{tag}" 291 | # 取出并删除异常URL、标签 292 | pipe = self._db.pipeline() 293 | pipe.lrange(key_errors, 0, -1) # 取出所有 294 | pipe.delete(key_errors) # 删除队列 295 | pipe.srem(self.key_tags, tag) # 删除标签 296 | res = pipe.execute() 297 | # 添加至waiting 298 | urls = res[0] 299 | if urls: 300 | self._db.lpush(self.key_waiting[priority], *urls) 301 | return len(urls) 302 | 303 | @property 304 | def details(self): 305 | """队列信息""" 306 | return { 307 | 'waiting': [self._db.llen(key) for key in self.key_waiting], 308 | 'processing': self._db.hlen(self.key_processing), 309 | 'results': int(self._db.get(self.key_filter_bloom_count) or 0), 310 | 'errors': sum([self._db.llen(f"{self.prefix_error}:{tag}") for tag in self.tags]) 311 | } 312 | -------------------------------------------------------------------------------- /pyloom/tasks.py: -------------------------------------------------------------------------------- 1 | import furl 2 | import json 3 | import redis 4 | import random 5 | import requests 6 | import traceback 7 | import simplejson.errors 8 | from .utils import * 9 | from .errors import * 10 | from lxml import etree 11 | from typing import List 12 | from typing import Union 13 | from . 
import scheduler, errors 14 | from bs4 import BeautifulSoup, element 15 | from .buckets import LocalBucket, ShareBucket 16 | 17 | logger = logging.getLogger("tasks") 18 | 19 | 20 | class Queue(object): 21 | """队列控制器""" 22 | 23 | def __init__(self, db, name, url): 24 | self._spider = scheduler.Spider(db, name) 25 | self._queue = scheduler.Queue(db, name) 26 | self.url = url 27 | 28 | @property 29 | def detail(self): 30 | return self._queue.details 31 | 32 | @property 33 | def timeout(self): 34 | return self._spider.get_field("timeout") 35 | 36 | @timeout.setter 37 | def timeout(self, value): 38 | if not isinstance(value, (int, float)): 39 | raise errors.TaskError("timeout应为int或float型") 40 | self._spider.set_field("timeout", value) 41 | 42 | @property 43 | def interval(self): 44 | return self._spider.get_field("interval") 45 | 46 | @interval.setter 47 | def interval(self, value): 48 | if not isinstance(value, (int, float)): 49 | raise errors.TaskError("interval应为int或float型") 50 | self._spider.set_field("interval", value) 51 | 52 | def freeze(self, seconds): 53 | """暂停调度seconds秒""" 54 | last_pop_time = time.time() + seconds - self.interval 55 | self._spider.set_field("last_pop_time", last_pop_time) 56 | logger.info("暂停调度", seconds) 57 | 58 | def stop(self): 59 | """停止调度,爬虫状态更改为'stop'""" 60 | logger.info("爬虫状态更改为'stop'") 61 | self._spider.set_field("status", -2) 62 | 63 | def finish(self): 64 | """ 65 | 提前完成调度,爬虫状态更改为'finish' 66 | 默认情况下,当所有队列均为空时,爬虫状态自动变为'finish' 67 | """ 68 | logger.info("爬虫状态更改为'finish'") 69 | self._spider.set_field("status", 0) 70 | 71 | 72 | class UserAgent(object): 73 | _ua = None 74 | _browsers = None 75 | 76 | def __getitem__(self, item): 77 | if UserAgent._ua is None: 78 | filename = os.path.join(os.path.dirname(__file__), "user-agent.json") 79 | with open(filename, encoding='utf8') as f: 80 | UserAgent._ua = json.load(f) 81 | UserAgent._browsers = list(UserAgent._ua.keys()) 82 | if item == 'random': 83 | item = random.choice(UserAgent._browsers) 84 | return random.choice(UserAgent._ua[item]) 85 | 86 | # 便于IDE提示 87 | @property 88 | def chrome(self): 89 | return self["chrome"] 90 | 91 | @property 92 | def ie(self): 93 | return self["ie"] 94 | 95 | @property 96 | def safari(self): 97 | return self["safari"] 98 | 99 | @property 100 | def firefox(self): 101 | return self["firefox"] 102 | 103 | @property 104 | def android(self): 105 | return self["android"] 106 | 107 | @property 108 | def random(self): 109 | return self["random"] 110 | 111 | 112 | class CSS(object): 113 | def __init__(self, root, pattern=":root"): 114 | if isinstance(root, (element.Tag, type(None))): 115 | self._root = root 116 | elif isinstance(root, str): 117 | self._root = BeautifulSoup(root, "lxml") 118 | else: 119 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造CSS") 120 | 121 | self._pattern = pattern 122 | self._default = ArgDefault 123 | 124 | def __bool__(self): 125 | return self._root is not None 126 | 127 | def __repr__(self): 128 | return f"CSS('{self._pattern}')" 129 | 130 | def one(self, pattern): 131 | node = self._root.select_one(pattern) 132 | return CSS(node, pattern) # type: CSS 133 | 134 | def many(self, pattern) -> List['CSS']: 135 | nodes = self._root.select(pattern) 136 | return [CSS(node, pattern) for node in nodes] 137 | 138 | def exist(self, pattern): 139 | return bool(self.one(pattern)) 140 | 141 | def default(self, value): 142 | self._default = value 143 | return self 144 | 145 | def text(self, regex=None, strip=True, separator=""): 146 | if self._root is None: 147 | if 
self._default is ArgDefault: 148 | raise errors.TaskError(f"未找到:{repr(self)}") 149 | else: 150 | # 默认值不校验格式,直接返回 151 | return self._default 152 | _text = self._root.get_text(separator, strip) 153 | if regex is None or re.match(regex, _text): 154 | return _text 155 | else: 156 | raise errors.TaskError(f"未通过正则校验:{regex}") 157 | 158 | def html(self): 159 | if self._root is None: 160 | if self._default is ArgDefault: 161 | raise errors.TaskError(f"未找到:{repr(self)}") 162 | else: 163 | # 默认值不校验格式,直接返回 164 | return self._default 165 | return str(self._root) 166 | 167 | @property 168 | def attrs(self): 169 | return self._root.attrs 170 | 171 | 172 | class XPath(object): 173 | def __init__(self, root, pattern="/*"): 174 | if isinstance(root, (etree._Element, type(None))): 175 | self._root = root 176 | elif isinstance(root, str): 177 | self._root = etree.HTML(root) 178 | else: 179 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造XPath") 180 | 181 | self._pattern = pattern 182 | self._default = ArgDefault 183 | 184 | def __bool__(self): 185 | return self._root is not None 186 | 187 | def __repr__(self): 188 | return f"XPath('{self._pattern}')" 189 | 190 | def one(self, pattern): 191 | nodes = self._root.xpath(pattern) 192 | if nodes: 193 | return XPath(nodes[0]) 194 | else: 195 | return XPath(None) 196 | 197 | def many(self, pattern): 198 | nodes = self._root.xpath(pattern) 199 | return [XPath(node, pattern) for node in nodes] 200 | 201 | def exist(self, pattern): 202 | return bool(self.one(pattern)) 203 | 204 | def default(self, value): 205 | self._default = value 206 | return self 207 | 208 | def text(self, regex=None, strip=True): 209 | if self._root is None: 210 | if self._default is ArgDefault: 211 | raise errors.TaskError(f"未找到{repr(self)}") 212 | else: 213 | # 默认值不校验格式,直接返回 214 | return self._default 215 | _text = self._root.text 216 | _text = '' if _text is None else _text 217 | _text = _text.strip() if strip else _text 218 | if regex is None or re.match(regex, _text): 219 | return _text 220 | else: 221 | raise errors.TaskError(f"未通过正则校验:{regex}") 222 | 223 | @property 224 | def attrs(self): 225 | return self._root.attrib 226 | 227 | 228 | class Regex(object): 229 | def __init__(self, root): 230 | self._root = root 231 | 232 | def __bool__(self): 233 | return self._root is not None 234 | 235 | def many(self, pattern): 236 | return re.findall(pattern, self._root) 237 | 238 | 239 | class Response(object): 240 | def __init__(self, resp: requests.Response): 241 | self._resp = resp 242 | self.encoding = "utf-8" 243 | # 解析器 244 | self._css = None # type: CSS 245 | self._xpath = None # type: XPath 246 | self._json = None # type: dict 247 | self._re = None # type: Regex 248 | 249 | self.content = resp.content 250 | self.status_code = resp.status_code 251 | self.url = resp.url 252 | self.furl = furl.furl(resp.url) 253 | self.request = resp.request # type: requests.PreparedRequest 254 | self.history = resp.history # type: list 255 | self.cookies = resp.cookies # type: dict 256 | self.headers = resp.headers # type: dict 257 | 258 | @property 259 | def re(self) -> Regex: 260 | if not self._re: 261 | self._re = Regex(self.text) 262 | return self._re 263 | 264 | @property 265 | def text(self) -> str: 266 | self._resp.encoding = self.encoding 267 | return self._resp.text 268 | 269 | @property 270 | def json(self) -> dict: 271 | if self._json: 272 | return self._json 273 | try: 274 | self._json = self._resp.json() 275 | return self._json 276 | except simplejson.errors.JSONDecodeError: 277 | raise 
errors.JSONDecodeError 278 | 279 | @property 280 | def css(self) -> CSS: 281 | if self._css is None: 282 | self._css = CSS(self.content.decode(self.encoding)) 283 | return self._css 284 | 285 | @property 286 | def xpath(self) -> XPath: 287 | if self._xpath is None: 288 | self._xpath = XPath(self.content.decode(self.encoding)) 289 | return self._xpath 290 | 291 | def __repr__(self): 292 | return f"Response({self.status_code})" 293 | 294 | 295 | class Tracking(object): 296 | """数据埋点""" 297 | prefix = 'tracking' 298 | 299 | def __init__(self, name, db): 300 | self._name = name 301 | self._db = db 302 | 303 | def incr(self, field, amount=1): 304 | return self._db.incr(f"{self.prefix}:{self._name}:{field}", amount) 305 | 306 | def get(self, field): 307 | r = self._db.get(f"{self.prefix}:{self._name}:{field}") 308 | return int(r) if r else None 309 | 310 | @property 311 | def fields(self): 312 | return [i.decode().split(":", 2)[2] for i in self._db.keys(f"{self.prefix}:{self._name}:*")] 313 | 314 | 315 | class Client(object): 316 | """封装requests,便于包装响应包、掌管代理""" 317 | 318 | def __init__(self, name, db, address=None): 319 | self._address = address 320 | self._set_address(address) 321 | self.name = name 322 | self.headers = {} 323 | self._db = db # type: redis.StrictRedis 324 | self._session = requests 325 | self._reuse = False 326 | 327 | def session(self): 328 | """返回跨请求保留Cookie的客户端""" 329 | client = Client(self.name, self._db, self._address) 330 | client._session = requests.session() 331 | return client 332 | 333 | def request(self, method, url, **kwargs): 334 | try: 335 | headers = {**self.headers, **kwargs.pop("headers", {})} 336 | proxies = {**self.proxies, **kwargs.pop("proxies", {})} 337 | resp = self._session.request( 338 | method, url, 339 | headers=headers, 340 | proxies=proxies, 341 | **kwargs 342 | ) 343 | except requests.exceptions.Timeout as e: 344 | raise errors.Timeout(e) 345 | except requests.exceptions.ProxyError as e: 346 | raise errors.ProxyError(e) 347 | except requests.exceptions.RequestException as e: 348 | raise errors.RequestError(e) 349 | except Exception as e: 350 | raise e 351 | return Response(resp) 352 | 353 | def get(self, url, params=None, **kwargs): 354 | return self.request("get", url, params=params, **kwargs) 355 | 356 | def post(self, url, data=None, json=None, **kwargs): 357 | return self.request("post", url, data=data, json=json, **kwargs) 358 | 359 | def head(self, url, **kwargs): 360 | return self.request("head", url, **kwargs) 361 | 362 | def options(self, url, **kwargs): 363 | return self.request("options", url, **kwargs) 364 | 365 | def patch(self, url, data=None, **kwargs): 366 | return self.request("patch", url, data=data, **kwargs) 367 | 368 | def put(self, url, data=None, **kwargs): 369 | return self.request("put", url, data=data, **kwargs) 370 | 371 | def delete(self, url, **kwargs): 372 | return self.request("delete", url, **kwargs) 373 | 374 | def _set_address(self, address): 375 | if address: 376 | proxy = address.split(":", 2)[2] 377 | self.proxies = { 378 | "http": proxy, 379 | "https": proxy 380 | } 381 | self.proxy = proxy 382 | self.address = address 383 | logger.debug("设置代理", proxy) 384 | else: 385 | self.proxies = {} 386 | self.proxy = None 387 | self.address = None 388 | 389 | def reload_proxy(self) -> bool: 390 | """丢弃当前代理并更换新代理,若代理池已无可用代理,返回False""" 391 | recycle = [] 392 | try: 393 | while True: 394 | address = self._db.rpop(f"proxy:addresses:{self.name}") 395 | # 代理池空了 396 | if not address: 397 | raise TaskBreak(0) 398 | address = 
address.decode() # type: str 399 | _valid_at, _expire_at, _ = address.split(":", 2) 400 | valid_at, expire_at = float(_valid_at), float(_expire_at) 401 | # 未到可用时间,还回去 402 | if valid_at > time.time(): 403 | recycle.append(address) 404 | continue 405 | # 已到可用时间,但过期了,直接丢弃 406 | if expire_at < time.time(): 407 | continue 408 | self._set_address(address) 409 | return True 410 | finally: 411 | if recycle: 412 | self._db.lpush(f"proxy:addresses:{self.name}", *recycle) 413 | 414 | def reuse_proxy(self, freeze=0): 415 | """回收代理,并在freeze秒后可再次被分配""" 416 | # 只可reuse一次 417 | if self._reuse: 418 | return 419 | else: 420 | self._reuse = True 421 | if self.address: 422 | _, expire_at, proxy = self.address.split(":", 2) 423 | valid_at = time.time() + freeze 424 | self._db.lpush(f"proxy:addresses:{self.name}", f"{valid_at}:{expire_at}:{proxy}") 425 | logger.debug("回收代理", f"{valid_at}:{expire_at}:{proxy}") 426 | self._set_address(None) 427 | 428 | def __setattr__(self, key, value): 429 | if key in ['params', 'history']: 430 | setattr(self._session, key, value) 431 | else: 432 | super(Client, self).__setattr__(key, value) 433 | 434 | 435 | class Buckets(object): 436 | """数据存储""" 437 | 438 | def __init__(self, local, share): 439 | self.local = local # type: LocalBucket 440 | self.share = share # type: ShareBucket 441 | 442 | 443 | class Task(object): 444 | """描述爬虫行为的抽象类""" 445 | filters = [] 446 | 447 | def __init__(self, name, url, db, address): 448 | """ 449 | Args: 450 | name: 爬虫名 451 | url: 当前URL 452 | db: redis数据库(用户不可使用) 453 | address: 代理地址 454 | """ 455 | self._spider = scheduler.Spider(db, name) 456 | self._queue = scheduler.Queue(db, name) 457 | self._db = db # type: redis.StrictRedis 458 | 459 | self.url = url # type: str 460 | self.furl = furl.furl(url) 461 | self.name = name # type: str 462 | self.logger = logging.getLogger(name) 463 | self.client = Client(name, db, address) 464 | self.queue = Queue(db, name, url) 465 | self.ua = UserAgent() 466 | self.buckets = Buckets(LocalBucket.instance(name), ShareBucket(db, name)) 467 | self.args = self._spider.get_field("args") 468 | self.lock = self._db.lock # 分布式锁 469 | self.tracking = Tracking(name, db) 470 | self.result = None 471 | self.response = None # type: Response 472 | 473 | def on_download(self) -> Response: 474 | """下载并返回响应包""" 475 | raise NotImplementedError() 476 | 477 | def on_parse(self) -> dict: 478 | """提取并返回目标数据""" 479 | return {} 480 | 481 | def on_link(self) -> Union[list, dict]: 482 | """ 483 | 提取并返回新链接 484 | Returns: 485 | links: links可以是list和dict两种类型 486 | dict: 指定不同的优先级: {priority: urls} 487 | list: 将links中的url添加到优先级为1的队列中 488 | 相当于: {1: urls} 489 | """ 490 | 491 | def on_save(self): 492 | """存储数据""" 493 | self.logger.debug("on_save", self.result) 494 | 495 | def on_finish(self): 496 | """已完成""" 497 | 498 | def on_error(self, e) -> bool: 499 | """ 500 | 处理生命周期中抛出的异常(包括on_finish) 501 | Returns: 502 | True: 异常已被处理 503 | False: 异常无法处理 504 | """ 505 | return False 506 | 507 | 508 | def execute(task: Task): 509 | """ 510 | 运行task实例并处理所有异常 511 | Returns: 512 | links: {priority: urls} 513 | """ 514 | try: 515 | task.tracking.incr('on_download') 516 | task.response = task.on_download() 517 | task.tracking.incr('on_download_ok') 518 | task.result = task.on_parse() 519 | links = task.on_link() 520 | if isinstance(links, list): 521 | links = {3: links} 522 | elif links is None: 523 | links = {} 524 | elif not isinstance(links, dict): 525 | raise errors.TaskError(f"on_link返回值应是list或dict型,而非{type(links)}") 526 | task.on_save() 527 | 
task.on_finish() 528 | return links 529 | except errors.TaskFinish: 530 | logger.debug("TaskFinish", task.url) 531 | task.on_finish() 532 | return {} 533 | except errors.TaskBreak as e: 534 | logger.debug("TaskBack", e.priority, task.url) 535 | task._queue.insert(task.url, e.priority) 536 | return {} 537 | except errors.TaskError as e: 538 | task._queue.report_error(e.__class__.__name__, task.url) 539 | logger.warning("Task报告的异常", str(e), task.url) 540 | return {} 541 | except Exception as e: 542 | if task.on_error(e): 543 | return {} 544 | task._queue.report_error("unknown", task.url) 545 | logger.error(f"Task未处理的异常", "unknown", task.url) 546 | traceback.print_exc() 547 | return {} 548 | -------------------------------------------------------------------------------- /pyloom/user-agent.json: -------------------------------------------------------------------------------- 1 | { 2 | "chrome": [ 3 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", 4 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", 5 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 6 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 7 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 9 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 11 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36", 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36", 13 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 14 | "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 15 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 16 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 17 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", 18 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36", 19 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36", 20 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36", 21 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36", 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36", 23 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36", 24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F", 25 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 
Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 26 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", 27 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 29 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 30 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36", 31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36", 32 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36", 33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36", 34 | "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36", 35 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36", 36 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36", 37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36", 38 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36", 39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36", 40 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 41 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 42 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 43 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 44 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 46 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36", 47 | "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 48 | "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 49 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17", 50 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17", 51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15", 52 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14" 53 | ], 54 | "firefox": [ 55 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 56 | "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0", 57 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) 
Gecko/20100101 Firefox/33.0", 58 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0", 59 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0", 60 | "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0", 61 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0", 62 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0", 63 | "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0", 64 | "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0", 65 | "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3", 66 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0", 67 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0", 68 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0", 69 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0", 70 | "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0", 72 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0", 73 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0", 74 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0", 75 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0", 76 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0", 77 | "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0", 78 | "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0", 79 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 80 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 81 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0", 82 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 83 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0", 84 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0", 85 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0", 86 | "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0", 87 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0", 88 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0", 89 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0", 90 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", 91 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0", 92 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0", 93 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0", 94 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0", 95 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0", 96 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0", 97 | "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0", 98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0", 99 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0", 100 | "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0", 101 | "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 
Firefox/19.0", 102 | "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1", 103 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", 104 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6" 105 | ], 106 | "safari": [ 107 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A", 108 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25", 109 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 110 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10", 111 | "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3", 112 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 113 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 114 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 115 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 116 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 117 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 118 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 119 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 120 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 121 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 122 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 123 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 124 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 125 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 126 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 127 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 128 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 129 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 130 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 131 | "Mozilla/5.0 (Macintosh; U; Intel 
Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 132 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 133 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 134 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 135 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 136 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 137 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 138 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 139 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 140 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 141 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 142 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 143 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 144 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 145 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 146 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 147 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 148 | "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 149 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 150 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 151 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 152 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 153 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 154 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 155 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 156 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5" 157 | ], 158 | "ie": [ 159 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko", 160 | "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; 
rv:11.0) like Gecko", 161 | "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0", 162 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)", 163 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", 164 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)", 165 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)", 166 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)", 167 | "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)", 168 | "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)", 169 | "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)", 170 | "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)", 171 | "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))", 172 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 173 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)", 174 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)", 175 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7", 176 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)", 177 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)", 178 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 179 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 180 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)", 181 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0", 182 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)", 183 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)", 184 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)", 185 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)", 186 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)", 187 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205", 188 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)", 189 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)", 190 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)", 191 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; 
InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)", 192 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 193 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)", 194 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)", 195 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)", 196 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)", 197 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)", 198 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 199 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)", 200 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)", 201 | "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)", 202 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)", 203 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)", 204 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8", 205 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)", 206 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)", 207 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)", 208 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)" 209 | ], 210 | "android": [ 211 | "android Mozilla/5.0 (Linux; Android 8.0.0; ATU-AL10 Build/HUAWEIATU-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/hw", 212 | "android Mozilla/5.0 (Linux; Android 8.1.0; ONEPLUS A5000 Build/OPM1.171019.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/oppo" 213 | ] 214 | } 215 | -------------------------------------------------------------------------------- /pyloom/utils.py: -------------------------------------------------------------------------------- 1 | """小工具""" 2 | import re 3 | import os 4 | import sys 5 | import time 6 | import uuid 7 | import types 8 | import logging 9 | import readline 10 | import functools 11 | import itertools 12 | import importlib 13 | from 
pyloom.errors import * 14 | from importlib.machinery import SourceFileLoader 15 | from logging.handlers import TimedRotatingFileHandler 16 | 17 | logger = logging.getLogger("utils") 18 | 19 | 20 | class ArgDefault(object): 21 | """默认参数""" 22 | 23 | def __bool__(self): 24 | return False 25 | 26 | 27 | def patch_logger_format(): 28 | """使logger支持用多个参数构造日志内容""" 29 | log_bak = logging.Logger._log 30 | 31 | def log(self, level, msg, *args): 32 | gap = ' ' 33 | out = str(msg) + gap 34 | for value in args[0]: 35 | out = out + str(value) + gap 36 | log_bak(self, level, out, []) 37 | 38 | logging.Logger._log = log 39 | 40 | 41 | def patch_handler_color(handler): 42 | """使handler支持根据日志级别输出彩色日志""" 43 | emit_bak = handler.emit 44 | 45 | def emit(*args): 46 | level = args[0].levelno 47 | if level >= 50: 48 | color = '\x1b[31m' # red, critical 49 | elif level >= 40: 50 | color = '\x1b[31m' # red, error 51 | elif level >= 30: 52 | color = '\x1b[33m' # yellow, warning 53 | elif level >= 20: 54 | color = '\x1b[32m' # green, info 55 | elif level >= 10: 56 | color = '\x1b[35m' # pink, debug 57 | else: 58 | color = '\x1b[0m' # normal 59 | args[0].msg = color + args[0].msg + '\x1b[0m' 60 | return emit_bak(*args) 61 | 62 | handler.emit = emit 63 | 64 | 65 | class RedisScripts(object): 66 | """管理redis-lua脚本""" 67 | _scripts = {} 68 | 69 | @classmethod 70 | def load(cls, db): 71 | path = os.path.join(os.path.dirname(__file__), 'lua') 72 | for filename in os.listdir(path): 73 | lua_file = os.path.join(path, filename) 74 | with open(lua_file, encoding="utf-8") as f: 75 | sha1 = db.script_load(f.read()) 76 | command = filename.split('.')[0] 77 | RedisScripts._scripts[command] = sha1 78 | logger.info("缓存Lua脚本", command, sha1) 79 | 80 | @classmethod 81 | def sha1(cls, command): 82 | return RedisScripts._scripts[command] 83 | 84 | 85 | def dict_merge(base: dict, delta: dict, check_not_none=True) -> dict: 86 | """ 87 | 将delta递归合并至base,覆盖同名字段 88 | 若is_not_none为True, 89 | 合并后不应有值为ConfigNotNone,否则抛出ConfigNotNone异常 90 | Example: 91 | # 递归合并,修改实参 92 | >>> base = {'redis': {'host': '127.0.0.1', 'port': 6379}} 93 | >>> delta = {'redis': {'host': '192.168.1.1'}} 94 | >>> dict_merge(base, delta) 95 | {'redis': {'host': '192.168.1.1', 'port': 6379}} 96 | >>> base 97 | {'redis': {'host': '192.168.1.1', 'port': 6379}} 98 | 99 | # 参数check_not_none 100 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}} 101 | >>> delta = {'redis': {'host': '192.168.1.1'}} 102 | >>> dict_merge(base, delta, True) 103 | Traceback (most recent call last): 104 | ... 
105 | pyloom.errors.ConfigNotNone: 缺少配置项:'port' 106 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}} 107 | >>> dict_merge(base, delta, False) 108 | {'redis': {'host': '192.168.1.1', 'port': }} 109 | """ 110 | if not isinstance(base, dict): 111 | return delta 112 | common_keys = set(base).intersection(delta) 113 | new_keys = set(delta).difference(common_keys) 114 | for key in common_keys: 115 | base[key] = dict_merge(base[key], delta[key], check_not_none) 116 | for key in new_keys: 117 | base[key] = delta[key] 118 | if check_not_none: 119 | for key in base: 120 | if base[key] is ConfigNotNone: 121 | raise ConfigNotNone(key) 122 | return base 123 | 124 | 125 | def retry(tries=-1, delay=1, max_delay=None, backoff=0, catches=None, error=None): 126 | """ 127 | 自动重试 128 | 129 | 当delay=1,backoff=0时,依此休眠: 130 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 131 | 当delay=1,backoff=1时,依此休眠: 132 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 133 | 当delay=1,backoff=2时,依此休眠: 134 | [1, 2, 5, 10, 17, 26, 37, 50, 65, 82] 135 | 136 | Args: 137 | tries: 重试次数,(-1:不限重试次数) 138 | delay: 初始重试秒数 139 | max_delay: 最大重试秒数(None:不限) 140 | backoff: 退避指数 141 | catches: 可被捕捉的异常(RetryError始终可用) 142 | error: 达到最大重试次数时抛出的异常(默认RetryExceeded) 143 | """ 144 | if catches is None: 145 | catches = [] 146 | 147 | def decorator(func): 148 | @functools.wraps(func) 149 | def wrapper(*args, **kwargs): 150 | # 记数生成器 151 | if tries >= 0: 152 | count = range(tries) 153 | else: 154 | count = itertools.count() 155 | # 处理重试 156 | for i in count: 157 | setattr(wrapper, "count", i) 158 | try: 159 | return func(*args, **kwargs) 160 | except (RetryError, *catches): 161 | if backoff == 0: 162 | sleep = delay 163 | else: 164 | sleep = delay + i ** backoff 165 | if max_delay: 166 | sleep = min(sleep, max_delay) 167 | time.sleep(sleep) 168 | # 重试次数超限 169 | else: 170 | if error is None: 171 | raise RetryExceeded 172 | else: 173 | raise error 174 | 175 | return wrapper 176 | 177 | return decorator 178 | 179 | 180 | def template_input(template): 181 | """ 182 | 在命令行提示用户输入配置 183 | Args: 184 | template: 配置模板 185 | 例如完成员工信息填写: 186 | ArgDefault表示必填参数,无默认值 187 | [ 188 | { 189 | "name": 配置名, 190 | "title": 配置标题, 191 | "example": 示例, 192 | "default": 默认值(留空表示必填参数), 193 | "note": 提示信息, 194 | "type": 类型转换函数 195 | } 196 | ] 197 | """ 198 | configs = {} 199 | for fields in template: 200 | name = fields['name'] 201 | default = fields.get('default', ArgDefault) 202 | example = fields.get('example', ArgDefault) 203 | note = fields.get('note', ArgDefault) 204 | title = fields.get('title', name) 205 | regex = fields.get('regex', ArgDefault) 206 | _type = fields.get('type', ArgDefault) 207 | _range = fields.get('range', ArgDefault) 208 | if _type is not ArgDefault: 209 | output = f"{title}[{_type.__name__}]\n" 210 | else: 211 | output = f"{title}\n" 212 | if example is not ArgDefault: 213 | output += f"示例: {example}\n" 214 | if note is not ArgDefault: 215 | output += f"提示: {note}\n" 216 | output += '➜ ' 217 | first = True 218 | while True: 219 | if first: 220 | var = input(output) 221 | first = False 222 | else: 223 | var = input('➜ ') 224 | if var: 225 | # 类型检查 226 | if _type is not ArgDefault: 227 | try: 228 | var = _type(var) 229 | except ValueError: 230 | print(f"参数类型有误,请重试") 231 | continue 232 | # 范围检查 233 | if _range is not ArgDefault and var not in _range: 234 | print(f"参数范围有误,请重试") 235 | continue 236 | # 正则检查 237 | if regex is not ArgDefault and not re.match(regex, var): 238 | print(f"参数格式有误,请重试") 239 | continue 240 | break 241 | elif not var and default is not 
ArgDefault: 242 | var = default 243 | break 244 | else: 245 | print(f"参数不可留空,请重试") 246 | configs[name] = var 247 | return configs 248 | 249 | 250 | def load_py_configs(file) -> dict: 251 | """ 252 | 加载PY格式的配置文件,当配置文件为空时,返回{} 253 | Args: 254 | file: 配置文件路径 255 | """ 256 | if not os.path.exists(file): 257 | raise ConfigFileNotFoundError(file) 258 | m = SourceFileLoader(uuid.uuid4().hex, file).load_module() 259 | return {k: v for k, v in vars(m).items() if not k.startswith('__')} 260 | 261 | 262 | def load_spider_configs(path) -> dict: 263 | """ 264 | 加载爬虫配置 265 | Args: 266 | path: 爬虫目录 267 | """ 268 | _configs = { 269 | "seeders": ConfigNotNone, 270 | "interval": 3, 271 | "timeout": 120, 272 | "precision": 0.0001, 273 | "args": {} 274 | } 275 | conf = os.path.join(path, 'configs.py') 276 | if not os.path.exists(conf): 277 | raise ConfigFileNotFoundError(f"ERR: 未找到爬虫配置:'{conf}'") 278 | return dict_merge(_configs, load_py_configs(conf)) 279 | 280 | 281 | def tail(file): 282 | """模仿linux中的tail命令""" 283 | try: 284 | with open(file, 'rb') as f: 285 | for i in range(1, 11): 286 | try: 287 | f.seek(-(i ** 3), 2) 288 | except OSError: 289 | f.seek(-((i - 1) ** 3), 2) 290 | break 291 | while True: 292 | line = f.readline() 293 | if not line: 294 | time.sleep(0.1) 295 | continue 296 | try: 297 | yield line.decode('utf8') 298 | except UnicodeDecodeError: 299 | time.sleep(0.1) 300 | continue 301 | except KeyboardInterrupt: 302 | pass 303 | -------------------------------------------------------------------------------- /pyloom/worker.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import signal 3 | import traceback 4 | import threading 5 | import multiprocessing 6 | from . import buckets 7 | from .utils import * 8 | from .tasks import Task, execute 9 | from .scheduler import Spider, Queue 10 | 11 | logger = logging.getLogger("worker") 12 | 13 | 14 | def worker_process(redis_conf, spiders, threads, token_curr, token_new): 15 | """ 16 | Worker子进程,负责启动线程 17 | Args: 18 | redis_conf: redis数据库 19 | spiders: 所有爬虫配置表,{name: (path, version)} 20 | threads: 线程数 21 | token_curr: 新建进程时的token 22 | token_new: 父进程中最新的token 23 | 当token_curr与token_new不同时,表示父进程已更新了路由, 24 | 线程在完成当前生命周期后需自行退出 25 | """ 26 | logger.debug("Worker进程已启动") 27 | # Manager的共享变量在并发启动过多进程时会出现ConnectionRefusedError 28 | for _ in range(60): 29 | try: 30 | spiders.items() 31 | break 32 | except Exception: 33 | pass 34 | else: 35 | logger.fatal("Worker进程退出,spiders超时未就绪") 36 | return 37 | 38 | thread_ids = [] 39 | # 构造路由,{name: [[regex, task]...]} 40 | routers = {} 41 | for name, (path, version) in spiders.items(): 42 | tasks = import_tasks(path) 43 | if tasks: 44 | routers[name] = tasks 45 | logger.info("载入爬虫成功", name, version) 46 | else: 47 | logger.info("载入爬虫失败,未发现合规Task类", name, version) 48 | # 启动线程 49 | try: 50 | logger.info("正在启动Worker线程") 51 | signal.signal(signal.SIGINT, signal.SIG_IGN) # 忽略Ctrl+C 52 | for thread_index in range(threads): 53 | thread = threading.Thread( 54 | target=worker_thread, 55 | args=(redis_conf, routers, token_curr, token_new, thread_index) 56 | ) 57 | thread.start() 58 | thread_ids.append(thread) 59 | logger.info("Worker线程启动成功") 60 | except Exception as e: 61 | logger.fatal("Worker进程结束,启动Worker线程时出现异常", e, '\n', traceback.format_exc()) 62 | return 63 | 64 | for i in itertools.count(): 65 | try: 66 | # 清理进程内的过期键 67 | if i % 500 == 0: 68 | count = buckets.LocalBucket.purge() 69 | if count: 70 | logger.debug(f"完成清理LocalBucket", count) 71 | # 线程全部退出后,结束进程 72 | if not 
any([t.is_alive() for t in thread_ids]): 73 | logger.info("Worker进程结束,线程已全部退出") 74 | return 75 | time.sleep(2) 76 | except Exception as e: 77 | logger.fatal("Worker进程异常", e, '\n', traceback.format_exc()) 78 | time.sleep(5) 79 | 80 | 81 | def worker_thread(redis_conf, routers, token_curr, token_new, thread_index): 82 | """ 83 | 循环:申请任务->执行任务->上报结果 84 | 线程内捕捉所有异常,永不退出(Ctrl+C除外) 85 | """ 86 | logger.debug("Worker线程已启动") 87 | db = redis.StrictRedis.from_url(redis_conf) 88 | pop_failure_count = 0 89 | while True: 90 | try: 91 | # 结束线程 92 | try: 93 | if token_curr != token_new.value: 94 | logger.info("Worker线程结束,收到退出信号") 95 | return 96 | except ConnectionRefusedError: 97 | logger.debug("Token未就绪") 98 | time.sleep(1) 99 | continue 100 | except (BrokenPipeError, FileNotFoundError, EOFError): 101 | logger.info("Worker线程结束,Token已关闭") 102 | return 103 | # 从队列中弹出URL 104 | if not routers: 105 | logger.info("本地爬虫列表为空,等待加载爬虫") 106 | while not routers: 107 | time.sleep(1) 108 | keys = list(routers.keys()) 109 | url, name, address = Queue.pop(db, keys) 110 | if not url: 111 | if pop_failure_count % 20 == 0: # 避免日志过多 112 | logger.debug("暂无已就绪任务,稍后重试") 113 | time.sleep(thread_index / 10 + 0.1) 114 | pop_failure_count += 1 115 | continue 116 | logger.info("获得任务", name, url, address) 117 | pop_failure_count = 0 118 | # 匹配Task类并执行 119 | tasks = routers.get(name, None) 120 | queue = Queue(db, name) 121 | if tasks is None: 122 | logger.warning("爬虫匹配失败", name, url) 123 | queue.report_error("none_spider", url) 124 | continue 125 | for regex, task_cls in tasks: 126 | if not regex.match(url): 127 | continue 128 | # 实例化Task并执行 129 | task = task_cls(name, url, db, address) 130 | links = execute(task) 131 | for priority, urls in links.items(): 132 | count = queue.add(urls, priority) 133 | logger.debug("添加任务", priority, f"{count}/{len(urls)}") 134 | logger.debug("报告任务完成", queue.report_finish(url), url) 135 | break 136 | else: 137 | logger.warning("任务匹配失败", name, url) 138 | queue.report_error("none_task", url) 139 | except Exception as e: 140 | logger.error("Worker线程异常", e, '\n', traceback.format_exc()) 141 | time.sleep(5) 142 | 143 | 144 | def import_tasks(path): 145 | """ 146 | 扫描并导入爬虫模块中的Tasks 147 | Return: 148 | [[regex, task]...] 149 | """ 150 | tasks = [] 151 | # 导入模块 152 | parent = os.path.dirname(path) 153 | if parent not in sys.path: 154 | sys.path.append(parent) 155 | basename = os.path.basename(path) 156 | try: 157 | logger.debug("加载爬虫模块", basename) 158 | _module = importlib.import_module(basename) 159 | except Exception as e: 160 | logger.error("加载爬虫模块异常", e, '\n', traceback.format_exc()) 161 | return [] 162 | # 扫描模块中合规的Task子类 163 | # 何为合规? 
164 | # 1.Task的子类; 2.filters成员; 3.导入无异常; 4.名称不以'__'开头 165 | for name in dir(_module): 166 | if name.startswith("__"): 167 | continue 168 | var = getattr(_module, name) 169 | try: 170 | is_subclass = issubclass(var, Task) 171 | except TypeError: 172 | continue 173 | try: 174 | if is_subclass: 175 | if hasattr(var, 'filters') and isinstance(var.filters, (list, tuple, str)): 176 | if isinstance(var.filters, str): 177 | filters = [var.filters] 178 | else: 179 | filters = var.filters 180 | for regex in filters: 181 | tasks.append([re.compile(regex), var]) 182 | logger.info("导入Task类", var.__name__) 183 | else: 184 | logger.warning("忽略Task类", var.__name__, "filters不合规") 185 | continue 186 | else: 187 | continue 188 | except Exception as e: 189 | logger.error("加载Task类异常", e, '\n', traceback.format_exc()) 190 | continue 191 | return tasks 192 | 193 | 194 | def start(spider_path, redis_conf, spider_configs, proxies, processes, threads): 195 | """ 196 | 重置爬虫状态后运行指定爬虫 197 | Args: 198 | spider_path: 爬虫目录 199 | redis_conf: Redis配置 200 | spider_configs: 爬虫配置 201 | proxies: 使用代理运行 202 | processes: 进程数量 203 | threads: 每个进程的线程数量 204 | """ 205 | logger.info("正在启动爬虫") 206 | db = redis.StrictRedis.from_url(redis_conf) 207 | name = os.path.basename(spider_path) # 取目录名为爬虫名 208 | RedisScripts.load(db) 209 | spider = Spider(db, name) 210 | # 注册爬虫/更新同名爬虫配置 211 | logger.info("注册爬虫", name) 212 | logger.info("爬虫配置", spider_configs) 213 | spider.upsert(spider_configs['seeders'], spider_configs['interval'], 214 | spider_configs['timeout'], spider_configs['precision'], 215 | spider_configs['args'], proxies, time.time()) 216 | # 重置爬虫状态 217 | status = spider.get_field("status") 218 | if status != 10: 219 | spider.set_field("status", 10) 220 | logger.info(f"重置爬虫状态", "{status} -> 10") 221 | # 回滚'timeout'异常队列 222 | queue = Queue(db, name) 223 | logger.debug("清理Redis") 224 | Queue.purge(db) 225 | logger.info("回滚超时任务") 226 | queue.rollback_tag("timeout", 0) 227 | # 启动Worker 228 | logger.info("正在启动Worker") 229 | spiders = multiprocessing.Manager().dict({name: [spider_path, 0]}) 230 | pool = [] 231 | token = multiprocessing.Manager().Value('d', 0) 232 | for _ in range(processes): 233 | p = multiprocessing.Process( 234 | target=worker_process, 235 | args=(redis_conf, spiders, threads, token.value, token) 236 | ) 237 | p.start() 238 | pool.append(p) 239 | logger.info("Worker启动成功") 240 | try: 241 | # 循环检查爬虫状态,当爬虫停止时终止运行 242 | while True: 243 | time.sleep(0.2) 244 | spider = Spider(db, name) 245 | status = spider.get_field("status") 246 | if status < 10: 247 | logger.info("爬虫停止,当前状态为:", Spider.status.get(status, "未知")) 248 | break 249 | except KeyboardInterrupt: 250 | logger.info("收到Ctrl+C", 'main') 251 | for p in pool: 252 | p.terminate() 253 | logger.info("爬虫停止", "Ctrl+C") 254 | except Exception as e: 255 | logger.error("爬虫停止", "未知异常", e, '\n', traceback.format_exc()) 256 | 257 | 258 | def start_all(redis_conf, spiders_path, processes, threads): 259 | """ 260 | 启动所有爬虫 261 | Args: 262 | redis_conf: Redis配置 263 | spiders_path: 放置所有爬虫的目录 264 | processes: 进程数量 265 | threads: 每个进程的线程数量 266 | """ 267 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='pyloom', 5 | version='0.0.7', 6 | packages=['pyloom'], 7 | url='https://pyloom.com', 8 | license='https://opensource.org/licenses/MIT', 9 | author='pyloom', 10 | author_email='ss@uutoto.com', 11 | 
description='古老的东方有一条虫,它的名字叫爬龙。', 12 | entry_points={ 13 | 'console_scripts': [ 14 | 'pyloom = pyloom.entry:main' 15 | ] 16 | }, 17 | install_requires=[ 18 | 'redis', 19 | 'cryptography >= 2.2.1', 20 | 'requests[security, socks] >= 2.10.0', 21 | 'bs4', 22 | 'lxml', 23 | 'furl', 24 | 'simplejson', 25 | 'checksumdir', 26 | 'docutils', # python-daemon的依赖 27 | 'python-daemon', 28 | 'tabulate', 29 | 'psutil' 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /spiders/DouBan250/README.md: -------------------------------------------------------------------------------- 1 | ## 豆瓣TOP250电影爬虫 2 | 3 | 用于演示编写最基本的爬虫、测试新功能。 -------------------------------------------------------------------------------- /spiders/DouBan250/__init__.py: -------------------------------------------------------------------------------- 1 | from pyloom.tasks import * 2 | 3 | 4 | class DouBanTask(Task): 5 | filters = ["^https://movie.douban.com/top250(\?start=\d+)?$"] 6 | 7 | def on_download(self): 8 | return self.client.get( 9 | url=self.url, 10 | headers={ 11 | "Host": "movie.douban.com", 12 | "User-Agent": self.ua.chrome 13 | } 14 | ) 15 | 16 | def on_parse(self): 17 | nodes = self.response.css.many("div.article ol > li") 18 | return [{ 19 | "title": node.one("span.title").text(), 20 | "rating": node.one("span.rating_num").text(), 21 | "quote": node.one("p.quote > span.inq").text() 22 | } for node in nodes] 23 | 24 | def on_link(self): 25 | if self.url.endswith("top250"): 26 | return [f"{self.url}?start={i}" for i in range(25, 250, 25)] 27 | 28 | def on_save(self): 29 | for movie in self.result: 30 | self.logger.info("抓到电影", movie) 31 | -------------------------------------------------------------------------------- /spiders/DouBan250/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://movie.douban.com/top250" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 5 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/README.md: -------------------------------------------------------------------------------- 1 | ## 豆瓣图书爬虫 2 | 3 | ### 图书信息(BookDetailsTask) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | -------------------- | ------------------------------------------------------------ | -------- | 7 | | result.title | 社会研究方法 | 书名 | 8 | | result.cover | https://img3.doubanio.com/view/subject/l/public/s2932505.jpg | 封面 | 9 | | result.info | {'作者': '[美]劳伦斯·纽曼', '出版社', '中国人民大学出版社'} | 基本信息 | 10 | | result.rating_num | 9.0 | 评分 | 11 | | result.rating_people | 202 | 评分人数 | 12 | | result.intro | 迄今所见中文社会研究方法书中最好的…… | 简介 | 13 | | result.tags | ['社会学', '研究方法'] | 标签 | 14 | 15 | ```python 16 | self.result = { 17 | 'title': '社会研究方法', 18 | 'cover': 'https://img3.doubanio.com/view/subject/l/public/s2932505.jpg', 19 | 'info': { 20 | '作者': '[美]劳伦斯·纽曼', 21 | '出版社': '中国人民大学出版社', 22 | '副标题': '定性和定量的取向', 23 | '原作名': 'Basics of Social Research: Qualitative and Quantitative Approaches', 24 | '译者': '郝大海', 25 | '出版年': '2007', 26 | '页数': '809', 27 | '定价': '89.80元', 28 | '丛书': '社会学译丛·经典教材系列', 29 | 'ISBN': '9787300075648' 30 | }, 31 | 'rating_num': '9.0', 32 | 'rating_people': '202', 33 | 
'intro': '迄今所见中文社会研究方法书中最好的一本,极力推荐研究生教学中采用。理清了许多问题,对定性和定量的对比非常精彩。', 34 | 'tags': ['社会学', '研究方法', '方法论', '社会研究方法', '定性', '教材', '纽曼', '定量'] 35 | } 36 | ``` 37 | 38 | + 基本信息(result.info),每本书的字段不固定。 39 | 40 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://book.douban.com/tag/?view=cloud" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = -10 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/tasks.py: -------------------------------------------------------------------------------- 1 | from pyloom.errors import * 2 | from pyloom.tasks import Task, CSS, retry 3 | 4 | 5 | class BaseTask(Task): 6 | @retry(10, 0) 7 | def on_download(self): 8 | """下载页面""" 9 | try: 10 | response = self.client.get( 11 | url=self.url, 12 | allow_redirects=False, 13 | headers={ 14 | "Host": "book.douban.com", 15 | "User-Agent": self.ua.chrome 16 | }, 17 | timeout=8 18 | ) 19 | except (ProxyError, RequestError): 20 | self.client.reload_proxy() 21 | raise RetryError 22 | # 检查是否被封禁 23 | if response.status_code == 200: 24 | s = 'window.location.href="https://sec.douban.com/a' 25 | if s in response.text: 26 | self.logger.warning("IP被封禁:200", self.client.address) 27 | self.client.reuse_proxy(300) 28 | else: 29 | self.client.reuse_proxy() 30 | return response 31 | elif response.status_code == 302: 32 | self.logger.warning("IP被封禁:302", self.client.address) 33 | self.client.reuse_proxy(300) 34 | else: 35 | self.logger.warning("请求错误", response.status_code) 36 | self.client.reload_proxy() 37 | raise RetryError 38 | 39 | def parse_tag_urls(self): 40 | """提取页面中所有的标签链接""" 41 | # 获取所有标签详情页的相对路径 42 | paths = self.response.re.many("/tag/\w+") 43 | # 构造每个标签前50页的标签详情页链接,优先级为2:最低 44 | return [ 45 | f"https://book.douban.com{path}?start={i*20}&type=R" 46 | for path in paths for i in range(50) 47 | ] 48 | 49 | 50 | class BookDetailsTask(BaseTask): 51 | """图书详情页""" 52 | filters = ["https://book.douban.com/subject/(\d+)/"] 53 | 54 | def on_parse(self): 55 | css = self.response.css 56 | # 书籍基本信息 57 | info = {} 58 | for line in css.one("div#info").html().split("
"): 59 | items = [ 60 | ' '.join(s.split()) 61 | for s in CSS(line).text(separator=" ").split(":", 1) 62 | if s.strip() 63 | ] 64 | if len(items) == 2: 65 | info[items[0]] = items[1] 66 | result = { 67 | "title": css.one("h1 > span").text(), 68 | "cover": css.one("div#mainpic img").attrs.get("src", None), 69 | "info": info, 70 | "rating_num": css.one("div.rating_self > strong.rating_num").text() or None, 71 | "rating_people": css.one("a.rating_people > span").default(None).text(), 72 | "intro": css.one("div#link-report div.intro > p").default(None).text(separator="\n"), 73 | "tags": [n.text() for n in css.many("div#db-tags-section a")], 74 | } 75 | return result 76 | 77 | def on_link(self): 78 | books = self.response.re.many("https://book.douban.com/subject/\d+/") 79 | # 指定优先级 80 | return { 81 | 0: books, 82 | 4: self.parse_tag_urls() 83 | } 84 | 85 | def on_save(self): 86 | self.logger.info("抓到新书", self.result) 87 | 88 | 89 | class TagsTask(BaseTask): 90 | """热门标签页""" 91 | filters = ["https://book.douban.com/tag/\?view=cloud"] 92 | 93 | def on_link(self): 94 | return self.parse_tag_urls() 95 | 96 | 97 | class TagDetailsTask(BaseTask): 98 | """标签详情页""" 99 | filters = ["https://book.douban.com/tag/(\w+)\?start=(\d+)&type=R"] 100 | 101 | def on_link(self): 102 | books = self.response.re.many("https://book.douban.com/subject/\d+/") 103 | return { 104 | 0: books, 105 | 4: self.parse_tag_urls() 106 | } 107 | -------------------------------------------------------------------------------- /spiders/LaGou/README.md: -------------------------------------------------------------------------------- 1 | ## 拉钩爬虫 2 | 3 | ### 职位详情(JobDetails) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | ------------------ | -------------------------------------------- | --------------------- | 7 | | result._id | 5080106 | 职位id | 8 | | result.title | 演员实习生 | 名称 | 9 | | result.label | ['移动互联网', '广告营销'] | 标签 | 10 | | result.job_request | 2k-4k/上海 /经验应届毕业生 /大专及以上 /实习 | 要求 | 11 | | result.advantage | 周末双休,地铁周边,做五休二,氛围融洽 | 职位诱惑 | 12 | | result.job_bt | 职位描述:岗位职责:1参与公司广告... 
| 职位描述 | 13 | | result.work_addr | 上海-徐汇区- 桂林路396号3号楼 | 工作地址 | 14 | | result.status | 0 | 状态,0:进行中 1:结束 | 15 | | result.job_company | 乐推(上海)文化传播有限公司 | 公司名字 | 16 | | result.type | 移动互联网,广告营销领域 | 类型 | 17 | | result.time | 2018-09-02 | 发布时间 | 18 | 19 | ```python 20 | self.result = { 21 | '_id': '5080106', 22 | 'title': '演员实习生', 23 | 'label': ['移动互联网', '广告营销'], 24 | 'job_request': '2k-4k/上海 /经验应届毕业生 /大专及以上 /实习', 25 | 'advantage': '周末双休,地铁周边,做五休二,氛围融洽', 26 | 'job_bt': '职位描述:岗位职责:1参与公司广告和短剧的拍摄;2负责公司项目前期筹备等的相关工作;3出演抖音广告与搞笑视频。任职要求:1长相甜美,外形清新亮丽,镜头感强,有强烈的表现力;2专科以上学历,表演系专业优先;3性格活泼开朗、思维活跃、为人正直;4工作态度积极;5仅仅招收女演员。', 27 | 'work_addr': '上海-徐汇区- 桂林路396号3号楼', 28 | 'status': 0, 29 | 'job_company': '乐推(上海)文化传播有限公司', 30 | 'type': '移动互联网,广告营销领域', 31 | 'time': '2018-09-02' 32 | } 33 | ``` 34 | 35 | 36 | 37 | ### 公司详情(GongSiDetails) 38 | 39 | | 字段 | 示例 | 说明 | 40 | | ------------------------ | -------------------------- | -------- | 41 | | result._id | 324 | 公司id | 42 | | result.company_abbr | 爱立示 | 简称 | 43 | | result.company_full_name | 慈溪爱立示信息科技有限公司 | 全称 | 44 | | result.type | 信息安全,数据服务 | 类型 | 45 | | result.process | 未融资 | 融资状态 | 46 | | result.number | 15-50人 | 人数 | 47 | | result.address | 北京 | 公司地点 | 48 | | result.label | ['技能培训', '岗位晋升'] | 公司标签 | 49 | | result.website | http://www.alstru.com | 公司网站 | 50 | 51 | ```python 52 | self.result = { 53 | '_id': '324', 54 | 'company_abbr': '爱立示', 55 | 'company_full_name': '慈溪爱立示信息科技有限公司', 56 | 'type': '信息安全,数据服务', 57 | 'process': '未融资', 58 | 'number': '15-50人', 59 | 'address': '北京', 60 | 'label': ['技能培训', '岗位晋升', '扁平管理', '领导好', '五险一金', '弹性工作'], 61 | 'website': 'http://www.alstru.com' 62 | } 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /spiders/LaGou/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | from .tasks import * 3 | 4 | 5 | def reactor(): 6 | return time.time() 7 | -------------------------------------------------------------------------------- /spiders/LaGou/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 3600 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/LaGou/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import uuid 4 | import random 5 | import string 6 | import datetime 7 | from pyloom import tasks, errors 8 | 9 | 10 | class LaGouJobTask(tasks.Task): 11 | @staticmethod 12 | def get_random(const): 13 | return "".join(random.sample(string.ascii_letters + string.digits, const)) 14 | 15 | @staticmethod 16 | def get_uuid(): 17 | return time.strftime("%Y%m%d%H%M%S-", time.localtime()) + str(uuid.uuid1()) 18 | 19 | def get_cookies(self): 20 | cookies = { 21 | 'LGUID': self.get_uuid(), 22 | 'user_trace_token': '20180705084851-8a154ee4-0f2b-406d-9130-e835805b49ee', 23 | 'X_HTTP_TOKEN': 'c2e6c0237f5362aca8d13748cfdd8274', 24 | 'JSESSIONID': self.get_random(47).upper(), 25 | 'SEARCH_ID': self.get_random(32).lower(), 26 | 'LGSID': 
self.get_uuid(), 27 | 'PRE_UTM': '', 28 | 'PRE_HOST': '', 29 | 'PRE_SITE': '', 30 | 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com', 31 | 'LGRID': self.get_uuid() 32 | } 33 | return cookies 34 | 35 | @tasks.retry(5, 0) 36 | def on_download(self): 37 | """下载页面""" 38 | if self.buckets.local.get('cookies') is None: 39 | self.buckets.local.set('cookies', self.get_cookies()) 40 | cookies = self.buckets.local.get('cookies') 41 | try: 42 | response = self.client.get( 43 | url=self.url, 44 | allow_redirects=False, 45 | headers={ 46 | "User-Agent": self.ua.chrome, 47 | "Accept-Encoding": "gzip", 48 | "Host": "www.lagou.com", 49 | "Referer": "https://www.lagou.com/jobs/list_" 50 | }, 51 | cookies=cookies 52 | ) 53 | except errors.ProxyError: 54 | self.logger.info("代理错误") 55 | raise errors.RetryError 56 | except errors.RequestError: 57 | self.logger.info("请求错误") 58 | raise errors.RetryError 59 | 60 | if response.status_code == 301: 61 | # 页面被删除 62 | raise errors.TaskFinish 63 | elif response.status_code == 302: 64 | self.logger.info(f"网页被封") 65 | self.buckets.local.set('cookies', self.get_cookies()) 66 | self.queue.freeze(5) 67 | raise errors.RetryError 68 | elif "页面加载中" in response.text or "错误网关" in response.text: 69 | raise errors.RetryError 70 | else: 71 | return response 72 | 73 | 74 | class JobDetails(LaGouJobTask): 75 | """职位详情页面""" 76 | filters = "https://www.lagou.com/jobs/(\d+).html" 77 | 78 | def on_parse(self): 79 | """提取数据""" 80 | try: 81 | publish_time = self.response.css.one(".publish_time").text()[0:-8] 82 | except errors.TaskError as e: 83 | return 84 | 85 | if re.match("(\d+):(\d+)", publish_time) is not None: 86 | publish_time = time.strftime("%Y-%m-%d", time.localtime()) 87 | elif re.match("(\d+)天前", publish_time): 88 | publish_time = (datetime.date.today() - 89 | datetime.timedelta(days=int(publish_time[0]))).strftime('%Y-%m-%d') 90 | status = 0 if self.response.css.one(".send-CV-btn").text() == "投个简历" else 1 91 | result = { 92 | "_id": re.search("(\d+)", self.url).group(0), 93 | "title": self.response.css.one(".job-name > .name").text(), 94 | "label": [label.text() for label in self.response.css.many(".labels")], 95 | "job_request": "".join(request.text() for request in self.response.css.many(".job_request > p > span")), 96 | "advantage": self.response.css.one(".job-advantage > p").text(), 97 | "job_bt": self.response.css.one(".job_bt").text(), 98 | "work_addr": self.response.css.one(".work_addr").text()[0:-8], 99 | "status": status, 100 | "job_company": self.response.css.one("#job_company > dt > a > img").attrs["alt"], 101 | "type": self.response.css.one(".c_feature > li").text(), 102 | "time": publish_time 103 | } 104 | return result 105 | 106 | def on_link(self): 107 | """提取链接""" 108 | job_urls = [] 109 | max_id = self.buckets.share.get("max_id") or 4913130 # 爬虫的最大URL 110 | use_id = self.buckets.share.get("use_id") or -1 # 当前使用的最大URL 111 | waiting_url_const = self.queue.detail["waiting"][1] # 当前职位等待队列中的URL数量 112 | 113 | if use_id >= max_id: 114 | self.queue.interval = 3600 115 | 116 | if waiting_url_const <= 2000: 117 | # 当等待队列中的URL数量少于1000时,添加URL到等待队列中 118 | start_id = use_id + 1 119 | end_id = use_id + 2000 if (use_id + 2000) < max_id else max_id 120 | for path in range(start_id, end_id): 121 | job_urls.append(f"https://www.lagou.com/jobs/{path}.html") 122 | self.buckets.share.set("use_id", use_id + 2000) 123 | 124 | gongsi_urls = [self.response.css.one("#job_company > dt > a").attrs.get("href", None)] 125 | return { 126 | 0: gongsi_urls, 127 | 1: job_urls 128 | } 129 | 130 | def 
on_save(self): 131 | """保存数据""" 132 | self.logger.info(f"抓到职位信息 {self.result}") 133 | 134 | 135 | class GongSiDetails(LaGouJobTask): 136 | """公司页面详情信息""" 137 | filters = "https://www.lagou.com/gongsi/(\d+).html" 138 | 139 | def on_parse(self): 140 | result = { 141 | "_id": re.search("(\d+)", self.url).group(0), 142 | "company_abbr": self.response.css.one(".hovertips").text(), 143 | "company_full_name": self.response.css.one(".hovertips").attrs["title"], 144 | "type": self.response.css.one(".type + span").text(), 145 | "process": self.response.css.one(".process + span").text(), 146 | "number": self.response.css.one(".number + span").text(), 147 | "address": self.response.css.one(".address + span").default(None).text(), 148 | "label": [label.text() for label in self.response.css.many(".con_ul_li")], 149 | "website": self.response.css.one(".hovertips").attrs.get("href", None) 150 | } 151 | return result 152 | 153 | def on_save(self): 154 | self.logger.info(f"抓到公司信息 {self.result}") 155 | 156 | 157 | class JobsList(LaGouJobTask): 158 | """工作列表,用于增量拉取职位信息""" 159 | filters = "https://www.lagou.com/jobs/positionAjax.json?(\w+)" 160 | 161 | @tasks.retry(-1, 0) 162 | def on_download(self): 163 | try: 164 | response = self.client.get( 165 | url=self.url, 166 | allow_redirects=False, 167 | headers={ 168 | "User-Agent": self.ua.chrome, 169 | "DNT": "1", 170 | "Host": "www.lagou.com", 171 | "Origin": "https://www.lagou.com", 172 | "Referer": "https://www.lagou.com/jobs/list_", 173 | "X-Anit-Forge-Code": "0", 174 | "X-Anit-Forge-Token": None, 175 | "X-Requested-With": "XMLHttpRequest" 176 | } 177 | ) 178 | except (errors.ProxyError, errors.RequestError): 179 | raise errors.RetryError 180 | 181 | if response.json['success'] is False: 182 | self.logger.error(f"列表页发现最新URL出现错误,速率过快") 183 | raise errors.TaskBreak 184 | else: 185 | return response 186 | 187 | def on_parse(self): 188 | """提取信息""" 189 | old_max_id = self.buckets.share.get("max_id") or 0 190 | new_max_id = self.response.json['content']['positionResult']['result'][0]['positionId'] 191 | if old_max_id < new_max_id: 192 | self.buckets.share.set("max_id", new_max_id) 193 | self.queue.interval = 0.01 194 | return [f"https://www.lagou.com/jobs/{new_max_id}.html"] 195 | else: 196 | return [] 197 | 198 | def on_link(self): 199 | """提取链接""" 200 | return { 201 | 1: self.result, 202 | 2: [f"https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false&T={time.time()}"] 203 | } 204 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/README.md: -------------------------------------------------------------------------------- 1 | ## 拼多多爬虫 2 | 3 | #### 分类商品列表(OperationTask) 4 | 5 | 搜索栏分类商品列表 6 | 7 | | 字段 | 示例 | 说明 | 8 | | ------------------- | ------------------------------------------------------------ | ------------ | 9 | | result.goods_id | 2721076214 | 商品id | 10 | | result.goods_name | 【两件装】秋季男装长袖t恤... 
| 商品名称 | 11 | | result.thumb_url | http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg | 商品图片链接 | 12 | | result.cnt | 545 | 已售数量 | 13 | | result.normal_price | 2500 | 商品售价 | 14 | | result.market_price | 9900 | 商品标价 | 15 | | result.price | 1280 | 商品拼团价 | 16 | | result.updated_at | 2018-09-02 13:58:08.176553 | 爬取时间 | 17 | 18 | ```python 19 | self.result = [ 20 | { 21 | 'goods_id': 2721076214, 22 | 'goods_name': '【两件装】秋季男装长袖t恤青年韩版潮流上衣学生修身百搭打底衫', 23 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg', 24 | 'cnt': 545, 25 | 'normal_price': 2500, 26 | 'market_price': 9900, 27 | 'price': 1280, 28 | 'updated_at': '2018-09-02 13:58:08.176553' 29 | }, 30 | { 31 | 'goods_id': 142150779, 32 | 'goods_name': '【花花公子贵宾】春夏秋款宽松直筒牛仔裤男弹力休闲商务大码长裤', 33 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-08-14/a18025b1e91445e5ac2acb26be773cd1.jpeg', 34 | 'cnt': 294167, 35 | 'normal_price': 4990, 36 | 'market_price': 29800, 37 | 'price': 2990, 38 | 'updated_at': '2018-09-02 13:58:08.176553' 39 | } 40 | ... 41 | ] 42 | ``` 43 | 44 | 45 | 46 | #### 商品详情(GoodsTask) 47 | 48 | | 字段 | 示例 | 说明 | 49 | | ------------------- | ------------------------------------ | ---------------------------- | 50 | | result.goods_sn | "1805231480604761" | sn码 | 51 | | result.goods_id | 1480604761 | 商品id | 52 | | result.cat_id | 9813 | 搜索id | 53 | | result.goods_name | 【凡爱宝贝】3d立体墙贴自粘... | 名称 | 54 | | result.goods_desc | 【3d立体墙贴】【环保无味】... | 简介 | 55 | | result.market_price | 3500 | 标价 | 56 | | result.is_onsale | 1 | 是否在售,0:下架 1:出售 | 57 | | result.thumb_url | http://t00img.yangkeduo.com/... | 商品图标 | 58 | | result.hd_thumb_url | http://t00img.yangkeduo.com/... | 商品放大图标 | 59 | | result.image_url | http://t00img.yangkeduo.com/... 
| 商品图片链接 | 60 | | result.price | {"min_on_sale_group_price": 358,...} | 商品价格,见[详情](#price) | 61 | | result.gallery | [{"id": 34954263707,'url':...}] | 商品详情介绍 | 62 | | result.created_at | 1527069514 | 创建时间戳 | 63 | | result.sales | 167701 | 销售量 | 64 | | result.cat_id_list | [9316, 9402, 9813] | 商品多级分类 | 65 | | result.sku | [{"sku_id": 33940681934,...}] | 商品规格详情,见[详情](#sku) | 66 | 67 | ```python 68 | self.result = { 69 | "goods_sn": "1805231480604761", 70 | "goods_id": 1480604761, 71 | "cat_id": 9813, 72 | "goods_name": "【凡爱宝贝】3d立体墙贴自粘防水墙纸防撞壁纸客厅卧室砖纹贴纸", 73 | "goods_desc": "【3d立体墙贴】【环保无味】【无甲醛 免胶自粘】绿色环保、无毒、无味,免人工,带胶撕开底纸即可粘贴,产品粘性强,不易脱落,具有很好的防撞、防水、防潮效果,易遮盖污点,环保无异味,施工简单,规格:70cm宽X77cm高; 工厂直销,砖纹形,装饰儿童房、卧室、客厅背景墙、走廊,也可发挥想象自由裁剪DIY。【计算方式】长x宽=面积,总面积÷单片面积=片数,一片尺寸是70cm宽X77cm高=0.539平方【友情提示】为避免不够,建议需要尽量多买2片备着,因为不同批次颜色有可能存在差异,所以请亲们一次购买足够。", 74 | "market_price": 3500, 75 | "is_onsale": 1, 76 | "thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/f9ba2f52be83d2f55142c55f44ec678c.jpeg", 77 | "hd_thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/8c5790dfea2422328ee3a487f3685ed6.jpeg", 78 | "image_url": "http://t00img.yangkeduo.com/goods/images/2018-07-22/95065d45399fce770bb49de0fba5c590.jpeg", 79 | "goods_type": 1, 80 | "gallery": [ 81 | { 82 | "id": 34954263707, 83 | "url": "http://t00img.yangkeduo.com/t10img/images/2018-07-16/d864ed35818e90521cf858951d9dc349.jpeg" 84 | } 85 | ], 86 | "created_at": 1527069514, 87 | "sales": 167701, 88 | "price": { 89 | "min_on_sale_group_price": 358, 90 | "max_on_sale_group_price": 781, 91 | "min_on_sale_normal_price": 490, 92 | "max_on_sale_normal_price": 1500, 93 | "min_group_price": 358, 94 | "max_group_price": 781, 95 | "max_normal_price": 1500, 96 | "min_normal_price": 490, 97 | "old_min_on_sale_group_price": 390, 98 | "old_max_on_sale_group_price": 860, 99 | "old_min_group_price": 390, 100 | "old_max_group_price": 860 101 | }, 102 | "cat_id_list": [9316, 9402, 9813], 103 | "sku": [ 104 | { 105 | "sku_id": 33940681934, 106 | "goods_id": 1480604761, 107 | "thumb_url": "http://t00img.yangkeduo.com/t07img/images/2018-07-12/94b7c9302b62c64e22914e6e36fb9d40.png", 108 | "quantity": 0, 109 | "normal_price": 1500, 110 | "group_price": 561, 111 | "old_group_price": 610, 112 | "specs": [ 113 | { 114 | "spec_key": "尺寸", 115 | "spec_value": "尺寸70*77厘米/1张" 116 | }, 117 | { 118 | "spec_key": "颜色", 119 | "spec_value": "特价白色(70*77厘米)" 120 | } 121 | ] 122 | } 123 | ] 124 | } 125 | ``` 126 | 127 | + cat_id_list为商品的多级分类栏id,依次为商品一级分类、商品二级分类、商品三级分类 128 | 129 | ### 附录 130 | 131 | #### price 132 | 133 | | 值 | 含义 | 134 | | --------------------------- | ---------------------- | 135 | | min_on_sale_group_price | 在售商品团购最低价 | 136 | | max_on_sale_group_price | 在售商品团购最高价 | 137 | | min_on_sale_normal_price | 在售商品最低价 | 138 | | max_on_sale_normal_price | 在售商品最高价 | 139 | | min_group_price | 商品团购最低价 | 140 | | max_group_price | 商品团购最高价 | 141 | | max_normal_price | 商品最高价 | 142 | | min_normal_price | 商品最低价 | 143 | | old_min_on_sale_group_price | 在售商品团购旧的最低价 | 144 | | old_max_on_sale_group_price | 在售商品团购旧的最高价 | 145 | | old_min_group_price | 商品团购旧的最低价 | 146 | | old_max_group_price | 商品团购旧的最高价 | 147 | 148 | #### sku 149 | 150 | | 值 | 含义 | 151 | | ---------------- | ------------ | 152 | | sku_id | 规格id | 153 | | goods_id | 商品id | 154 | | thumb_url | 规格图片链接 | 155 | | quantity | 数据 | 156 | | normal_price | 标价 | 157 | | group_price | 团购价 | 158 | | old_group_price | 旧的团购价 | 159 | | specs.spec_key | 规格参数 | 160 | | specs.spec_value | 规格参数值 | 161 | 162 | 
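
The tasks in this spider (shown below) only log the parsed results. As a minimal, hypothetical sketch of how the GoodsTask result documented above could be persisted, the snippet flattens a few fields into one JSON line per item; the helper name, output path, and the assumption that prices are integer cents (分) are illustrative and not part of the project.

```python
import json


def save_goods_record(result, path="pdd_goods.jsonl"):
    """Append one GoodsTask result (shape documented above) as a JSON line.

    Hypothetical helper: field names follow the README tables; treating
    prices as integer cents is an assumption based on the sample values.
    """
    record = {
        "goods_id": result["goods_id"],
        "goods_name": result["goods_name"],
        "market_price_yuan": result["market_price"] / 100,
        "min_group_price_yuan": result["price"]["min_group_price"] / 100,
        "sales": result["sales"],
        "cat_id_list": result["cat_id_list"],
        "sku_count": len(result["sku"]),
        "created_at": result["created_at"],
    }
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```
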
-------------------------------------------------------------------------------- /spiders/PinDuoDuo/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations?pdduid=" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import string 4 | import datetime 5 | from pyloom import tasks 6 | from pyloom.errors import * 7 | 8 | 9 | def get_list_id(opt_id): 10 | """返回list_id:(opt_id)_(10位随机字符串)""" 11 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10)) 12 | 13 | 14 | class PinDuoDuoTask(tasks.Task): 15 | """搜索栏""" 16 | _redis = None 17 | goods_url = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/" 18 | operation_url = "http://apiv4.yangkeduo.com/v4/operation/" 19 | 20 | @tasks.retry(tries=5, backoff=0) 21 | def on_download(self): 22 | """下载链接""" 23 | try: 24 | resp = self.client.get( 25 | url=self.url, 26 | headers={ 27 | "User-Agent": self.ua.android, 28 | "Referer": "Android", 29 | "Host": "apiv4.yangkeduo.com" 30 | } 31 | ) 32 | except ProxyError: 33 | self.client.reload_proxy() 34 | raise RetryError 35 | except RequestError: 36 | raise RetryError 37 | 38 | try: 39 | if "error_code" in resp.json: 40 | error_code = resp.json.get('error_code', None) 41 | else: 42 | error_code = None 43 | except JSONDecodeError: 44 | error_code = None 45 | 46 | if error_code == 40001 or resp.status_code == 503 or resp.status_code == 504: 47 | self.client.reuse_proxy() 48 | raise RetryError 49 | 50 | if resp.status_code == 403 or resp.status_code == 429: 51 | self.client.reload_proxy() 52 | raise RetryError 53 | else: 54 | self.client.reuse_proxy() 55 | return resp 56 | 57 | 58 | class HomeOperationTask(PinDuoDuoTask): 59 | """搜索栏""" 60 | filters = "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations\?pdduid=" 61 | 62 | def on_parse(self): 63 | targets = [] 64 | for childrens in self.response.json: 65 | targets.append(childrens["id"]) 66 | for children in childrens["children"]: 67 | targets.append(children["id"]) 68 | return targets 69 | 70 | def on_link(self): 71 | return { 72 | 4: [f"{self.operation_url}{opt_id}/groups?opt_type=2&size=50&offset=0&list_id={get_list_id(opt_id)}&pdduid=" 73 | for opt_id in self.result] 74 | } 75 | 76 | 77 | class OperationTask(PinDuoDuoTask): 78 | """分类商品结果""" 79 | filters = "http://apiv4.yangkeduo.com/v4/operation/(\w+)" 80 | 81 | def on_parse(self): 82 | goods = [] 83 | for good in self.response.json["goods_list"]: 84 | goods.append( 85 | { 86 | "goods_id": good["goods_id"], 87 | "goods_name": good["goods_name"], 88 | "thumb_url": good["thumb_url"], 89 | "cnt": good["cnt"], 90 | "normal_price": good["normal_price"], 91 | "market_price": good["market_price"], 92 | 
"price": good["group"]["price"], 93 | "updated_at": str(datetime.datetime.now()) 94 | } 95 | ) 96 | operation = { 97 | "goods_id": [good["goods_id"] for good in goods], 98 | "opt_infos": self.response.json["opt_infos"], 99 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1], 100 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1], 101 | "flip": self.response.json["flip"], 102 | "next_offset": str(self.response.json["flip"]).split(";")[-1] 103 | } 104 | return goods, operation 105 | 106 | def on_link(self): 107 | goods, operation = self.result 108 | 109 | goods_list = [f"{self.goods_url}{goods_id}?goods_id={goods_id}&from=0&pdduid=" 110 | for goods_id in operation["goods_id"]] 111 | operation_list = [f'{self.operation_url}{opt_infos["id"]}/groups?opt_type=2&size=50&offset=0' 112 | f'&list_id={get_list_id(opt_infos["id"])}&pdduid=' 113 | for opt_infos in operation["opt_infos"]] 114 | 115 | if operation["flip"] is not None: 116 | operation_list.append(f'{self.operation_url}{operation["opt_id"]}/groups?opt_type=2&size=50&offset=' 117 | f'{operation["next_offset"]}&list_id={operation["list_id"]}' 118 | f'&flip={operation["flip"]}&pdduid=') 119 | return { 120 | 2: goods_list, 121 | 4: operation_list 122 | } 123 | 124 | def on_save(self): 125 | self.logger.info(f'抓到商品列表 {self.result[0]}') 126 | 127 | 128 | class GoodsTask(PinDuoDuoTask): 129 | """商品详情接口""" 130 | filters = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/(\w+)" 131 | 132 | def on_parse(self): 133 | _sku = self.response.json["sku"] 134 | goods_info = { 135 | "goods_sn": self.response.json["goods_sn"], 136 | "goods_id": self.response.json["goods_id"], 137 | "cat_id": self.response.json["cat_id"], 138 | "goods_name": self.response.json["goods_name"], 139 | "goods_desc": self.response.json["goods_desc"], 140 | "market_price": self.response.json["market_price"], 141 | "is_onsale": self.response.json["is_onsale"], 142 | "thumb_url": self.response.json["thumb_url"], 143 | "hd_thumb_url": self.response.json["hd_thumb_url"], 144 | "image_url": self.response.json["image_url"], 145 | "goods_type": self.response.json["goods_type"], 146 | "gallery": [{"id": gallery["id"], "url":gallery["url"]} for gallery in self.response.json["gallery"]], 147 | "created_at": self.response.json["created_at"], 148 | "sales": self.response.json["sales"], 149 | "price": { 150 | "min_on_sale_group_price": self.response.json["min_on_sale_group_price"], 151 | "max_on_sale_group_price": self.response.json["max_on_sale_group_price"], 152 | "min_on_sale_normal_price": self.response.json["min_on_sale_normal_price"], 153 | "max_on_sale_normal_price": self.response.json["max_on_sale_normal_price"], 154 | "min_group_price": self.response.json["min_group_price"], 155 | "max_group_price": self.response.json["max_group_price"], 156 | "max_normal_price": self.response.json["max_normal_price"], 157 | "min_normal_price": self.response.json["min_normal_price"], 158 | "old_min_on_sale_group_price": self.response.json["old_min_on_sale_group_price"], 159 | "old_max_on_sale_group_price": self.response.json["old_max_on_sale_group_price"], 160 | "old_min_group_price": self.response.json["old_min_group_price"], 161 | "old_max_group_price": self.response.json["old_max_group_price"] 162 | }, 163 | "cat_id_list": [self.response.json["cat_id_1"], 164 | self.response.json["cat_id_2"], 165 | self.response.json["cat_id_3"]] 166 | } 167 | sku = [] 168 | for sku_list in _sku: 169 | sku.append({ 170 | "sku_id": sku_list["sku_id"], 171 | 
"goods_id": sku_list["goods_id"], 172 | "thumb_url": sku_list["thumb_url"], 173 | "quantity": sku_list["quantity"], 174 | "normal_price": sku_list["normal_price"], 175 | "group_price": sku_list["group_price"], 176 | "old_group_price": sku_list["old_group_price"], 177 | "specs": sku_list["specs"] 178 | }) 179 | goods_info["sku"] = sku 180 | return goods_info 181 | 182 | def on_save(self): 183 | self.logger.info(f'抓到商品信息 {self.result}') 184 | -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/README.md: -------------------------------------------------------------------------------- 1 | ## 拼多多爬虫网页版 2 | 3 | #### 分类商品列表(ListTask) 4 | 搜索栏分类商品列表 5 | 6 | | 字段 | 示例 | 说明 | 7 | | ------------------- | ------------------------------------------------------------ | ------------ | 8 | | result.thumb_url | https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg | 商品图片链接 | 9 | | result.country | | 国度 | 10 | | result.goods_name | vivo原装耳机x21 x20... | 商品名称 | 11 | | result.short_name | vivo原装耳机x21 x20... | 商品简称 | 12 | | result.sales_tip | 已拼1490件 | 商品销售提示 | 13 | | result.goods_id | 6636323997 | 商品id | 14 | | result.cnt | 545 | 已售数量 | 15 | | result.normal_price | 2500 | 商品售价 | 16 | | result.market_price | 9900 | 商品标价 | 17 | | result.price | 1280 | 商品拼团价 | 18 | | result.link_url | goods.html?goods_id=6636323997&gallery_id=103375816423 | 商品详情连接 | 19 | | result.mall_name | | 商品店铺名称 | 20 | | result.tag | ['极速退款'] | 商品标签 | 21 | | result.updated_at | 2018-09-02 13:58:08.176553 | 爬取时间 | 22 | 23 | ```python 24 | self.result = [ 25 | { 26 | 'thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg', 27 | 'country': '', 28 | 'goods_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话', 29 | 'short_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话', 30 | 'sales_tip': '已拼1490件', 31 | 'cnt': 1490, 32 | 'goods_id': 6636323997, 33 | 'hd_thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg', 34 | 'hd_url': '', 35 | 'normal_price': 1480, 36 | 'market_price': 9900, 37 | 'price': 1280, 38 | 'link_url': 'goods.html?goods_id=6636323997&gallery_id=103375816423', 39 | 'mall_name': None, 40 | 'tag': ['极速退款'], 41 | 'updated_at': '2019-04-15 23:05:57.603136' 42 | }, 43 | ... 44 | ] 45 | ``` 46 | 47 | ### 商品详情(GoodsTask 48 | 49 | ```python 50 | self.result = { 51 | 'goods': { 52 | 'serverTime': 1555340763, 53 | 'serverTimeTen': 15553407630, 54 | 'allowedRegions': '2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,22,23,24,25,26,27,30,31,32', 55 | 'catID': 5794, 56 | 'country': '', 57 | 'warehouse': '', 58 | 'goodsDesc': '如果你还再为佩戴迷你型双耳容...', 59 | 'goodsID': 2058994703, 60 | 'goodsName': '防水超小无线蓝牙耳机双耳5.0跑步运动一对迷你vivo入耳oppo耳机', 61 | 'shareDesc': '如果你还再为佩戴迷你型双耳容...', 62 | 'goodsType': 1, 63 | 'localGroups': [], 64 | 'hasLocalGroup': 1, 65 | 'bannerHeight': 375, 66 | 'topGallery': [ 67 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/strip%7CimageView2/2/w/1300/q/80', 68 | ... 69 | ], 70 | 'viewImageData': [ 71 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/quality/70', 72 | ... 73 | ], 74 | 'detailGallery': [ 75 | {'url': '//t00img.yangkeduo.com/t09img/images/2018-07-03/84f1bc3741b3182df6f9d4dae633c9ec.jpeg?imageMogr2/quality/70', 'width': 790, 'height': 790}, 76 | ... 
77 | ], 78 | 'videoGallery': [], 79 | 'hasLiveGallery': False, 80 | 'descVideoGallery': [], 81 | 'mallID': 17984, 82 | 'groupTypes': [ 83 | {'requireNum': '1', 'price': '0', 'totalPrice': '0', 'groupID': 2960548556, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999}, 84 | {'requireNum': '2', 'price': '0', 'totalPrice': '0', 'groupID': 2960548557, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999} 85 | ], 86 | 'skus': [ 87 | { 88 | 'skuID': 38010790017, 89 | 'quantity': 158, 90 | 'initQuantity': 0, 91 | 'isOnSale': 1, 92 | 'soldQuantity': 0, 93 | 'specs': [ 94 | {'spec_key': '颜色', 'spec_value': '黑色-支持双耳通话', 'spec_key_id': 1215, 'spec_value_id': 843019793} 95 | ], 96 | 'thumbUrl': '//t00img.yangkeduo.com/t09img/images/2018-07-03/047554b0d2cd49183b5b2ed2380c528a.jpeg', 97 | 'limitQuantity': 999999, 98 | 'normalPrice': '218', 99 | 'groupPrice': '162', 100 | 'oldGroupPrice': '198', 101 | 'skuExpansionPrice': '0', 102 | 'unselectGroupPrice': '0' 103 | }, 104 | ... 105 | ], 106 | 'thumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/a5d77844f14e6438fe196b0d08fd9c63.jpeg', 107 | 'hdThumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/7869f190bfd614a9d3151316f02642a1.jpeg', 108 | 'eventType': 0, 109 | 'isOnSale': True, 110 | 'isGoodsOnSale': True, 111 | 'isSkuOnSale': True, 112 | 'freeCoupon': [], 113 | 'isApp': 0, 114 | 'isFreshmanApp': 0, 115 | 'sideSalesTip': '已拼47件', 116 | 'bottomSalesTip': '', 117 | 'hasAddress': False, 118 | 'catID1': 5752, 119 | 'catID2': 5793, 120 | 'catID3': 5794, 121 | 'catID4': 0, 122 | 'eventComing': False, 123 | 'isMutiGroup': False, 124 | 'isNewUserGroup': False, 125 | 'isSpike': False, 126 | 'isTodaySpike': False, 127 | 'isTomorrowSpike': False, 128 | 'activity': { 129 | 'activityID': 11464199, 130 | 'activityType': 101, 131 | 'startTime': 1554825600, 132 | 'endTime': 1555775999 133 | }, 134 | 'isGroupFree': False, 135 | 'isSpikeComing': False, 136 | 'overseaType': 0, 137 | 'isHaitao': False, 138 | 'isAppNewerJoinGroup': False, 139 | 'countryLogo': '', 140 | 'gpv': None, 141 | 'quickRefund': False, 142 | 'rv': True, 143 | 'maxNormalPrice': '218', 144 | 'minNormalPrice': '218', 145 | 'maxGroupPrice': '162', 146 | 'minGroupPrice': '162', 147 | 'maxOnSaleGroupPrice': '162', 148 | 'minOnSaleGroupPrice': '162', 149 | 'maxOnSaleGroupPriceInCent': 16200, 150 | 'minOnSaleGroupPriceInCent': 16200, 151 | 'maxOnSaleNormalPrice': '218', 152 | 'minOnSaleNormalPrice': '218', 153 | 'minTotalGroupPrice': '324', 154 | 'oldMinOnSaleGroupPriceInCent': 19800, 155 | 'unselectMinGroupPrice': '0', 156 | 'unselectMaxGroupPrice': '0', 157 | 'skipGoodsIDs': ['0'], 158 | 'tag': -1, 159 | 'icon': {'id': 5, 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/21bdb71af69e346fc73098a23e808656.png', 'width': 116, 'height': 45}, 160 | 'tagIcon': [], 161 | 'isSecondHand': 0, 162 | 'promotionBanner': { 163 | 'id': 1, 164 | 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 165 | 'default_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 166 | 'new_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 167 | 'url_v2': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 168 | 'url_v2_h': 96, 169 | 'url_v2_w': 750, 170 | 'serverTime': 1555340763 171 | }, 172 | 'isMallDsr': 1, 173 | 'hasPromotion': 1, 174 | 'appClientOnly': 0, 175 | 'isColdGoods': 1, 176 | 'singleCardStatus': 0, 177 | 'singleCardCount': 0, 
178 | 'goodsProperty': [ 179 | {'key': '佩戴方式', 'values': ['入耳式']}, 180 | ... 181 | ], 182 | ... 183 | } 184 | ``` 185 | 186 | get_anticontent.js来自https://github.com/SergioJune/Spider-Crack-JS/blob/master/pinduoduo/get_anticontent.js -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://mobile.yangkeduo.com/classification.html" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import execjs 4 | import string 5 | import datetime 6 | import re 7 | import os 8 | from pyloom import tasks 9 | from pyloom.errors import * 10 | 11 | 12 | def get_list_id(opt_id): 13 | """返回list_id:(opt_id)_(10位随机字符串)""" 14 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10)) 15 | 16 | 17 | def get_anti_content(ua): 18 | path = os.path.abspath(os.path.dirname(__file__)) 19 | with open(os.path.join(path, 'get_anticontent.js'), 'r', encoding='utf-8') as f: 20 | js = f.read() 21 | ctx = execjs.compile(js) 22 | url = "https://mobile.yangkeduo.com/catgoods.html" 23 | return ctx.call('get_anti', url, ua) 24 | 25 | 26 | class SearchTask(tasks.Task): 27 | filters = "https://mobile.yangkeduo.com/classification.html" 28 | 29 | @tasks.retry() 30 | def on_download(self): 31 | try: 32 | ua = self.ua.chrome # 随机获取ua 33 | resp = self.client.get( 34 | url=self.url, 35 | headers={ 36 | "User-Agent": ua, 37 | 'authority': 'mobile.yangkeduo.com', 38 | 'pragma': 'no-cache', 39 | 'cache-control': 'no-cache', 40 | 'upgrade-insecure-requests': '1', 41 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 42 | 'application/signed-exchange;v=b3', 43 | 'accept-encoding': 'gzip, deflate, br', 44 | 'accept-language': 'zh-CN,zh;q=0.9' 45 | } 46 | ) 47 | except ProxyError: 48 | self.client.reload_proxy() 49 | raise RetryError 50 | except RequestError: 51 | raise RetryError 52 | return resp 53 | 54 | def on_parse(self): 55 | data = json.loads(self.response.re.many("__NEXT_DATA__.*?__NEXT_LOADED_PAGES")[0][16:-20]) 56 | result = [] 57 | for i in data['props']['pageProps']['data']['operationsData']['detailData']: 58 | for j in i['cat']: 59 | result.append(f'https://mobile.yangkeduo.com/proxy/api/v4/operation/{j["optID"]}/groups' 60 | f'?offset=0&size=100&opt_type=2&sort_type=DEFAULT&list_id={get_list_id(j["optID"])}' 61 | f'&pdduid=0') 62 | return result 63 | 64 | def on_link(self): 65 | """解析url,并添加到队列""" 66 | return { 67 | 4: self.result 68 | } 69 | 70 | 71 | class ListTask(tasks.Task): 72 | filters = "https://mobile.yangkeduo.com/proxy/api/v4/operation/(\w+)" 73 | 74 | @tasks.retry() 75 | def on_download(self): 76 | try: 77 | ua = 
self.ua.chrome # 随机获取ua 78 | url = self.url + f"&anti_content={get_anti_content(ua)}" 79 | resp = self.client.get( 80 | url=url, 81 | headers={ 82 | "User-Agent": ua, 83 | 'authority': 'mobile.yangkeduo.com', 84 | 'pragma': 'no-cache', 85 | 'cache-control': 'no-cache', 86 | 'upgrade-insecure-requests': '1', 87 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 88 | 'application/signed-exchange;v=b3', 89 | 'accept-encoding': 'gzip, deflate, br', 90 | 'accept-language': 'zh-CN,zh;q=0.9' 91 | } 92 | ) 93 | except ProxyError: 94 | self.client.reload_proxy() 95 | raise RetryError 96 | except RequestError: 97 | raise RetryError 98 | 99 | return resp 100 | 101 | def on_parse(self): 102 | goods = [] 103 | for good in self.response.json["goods_list"]: 104 | goods.append( 105 | { 106 | "thumb_url": good["thumb_url"], 107 | "country": good["country"], 108 | "goods_name": good["goods_name"], 109 | "short_name": good["short_name"], 110 | "sales_tip": good["sales_tip"], 111 | "cnt": good["cnt"], 112 | "goods_id": good["goods_id"], 113 | "hd_thumb_url": good["hd_thumb_url"], 114 | "hd_url": good["hd_url"], 115 | "normal_price": good["normal_price"], 116 | "market_price": good["market_price"], 117 | "price": good["group"]["price"], 118 | "link_url": good["link_url"], 119 | "mall_name": good.get('mall_name'), 120 | "tag": [i["text"] for i in good["tag_list"]], 121 | "updated_at": str(datetime.datetime.now()) 122 | } 123 | ) 124 | operation = { 125 | "link_url": [good["link_url"] for good in goods], 126 | "opt_infos": self.response.json["opt_infos"], 127 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1], 128 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1], 129 | "flip": self.response.json["flip"], 130 | "next_offset": str(self.response.json["flip"]).split(";")[0] 131 | } 132 | return goods, operation 133 | 134 | def on_link(self): 135 | url = "https://mobile.yangkeduo.com/" 136 | 137 | goods, operation = self.result 138 | goods_list = [f'{url}{link_url}' for link_url in operation["link_url"]] 139 | operation_list = [f'{url}/proxy/api/v4/operation/{opt["id"]}/groups?offset=0&size=100&opt_type=2' 140 | f'&sort_type=DEFAULT&list_id={get_list_id(opt["id"])}&pdduid=0' 141 | for opt in operation["opt_infos"]] 142 | if operation["flip"] is not None: 143 | operation_list.append(f'{url}/proxy/api/v4/operation/{operation["opt_id"]}/groups?opt_type=2&size=100' 144 | f'&offset={operation["next_offset"]}&list_id={operation["list_id"]}' 145 | f'&flip={operation["flip"]}&pdduid=0') 146 | self.logger.debug(goods_list) 147 | self.logger.debug(operation_list) 148 | return { 149 | 1: goods_list, 150 | 4: operation_list 151 | } 152 | 153 | def on_save(self): 154 | self.logger.debug(self.result[0]) 155 | 156 | 157 | class GoodsTask(tasks.Task): 158 | filters = "https://mobile.yangkeduo.com/goods.html" 159 | 160 | @tasks.retry() 161 | def on_download(self): 162 | try: 163 | resp = self.client.get( 164 | url=self.url, 165 | headers={ 166 | "User-Agent": self.ua.chrome, 167 | 'authority': 'mobile.yangkeduo.com', 168 | 'pragma': 'no-cache', 169 | 'cache-control': 'no-cache', 170 | 'upgrade-insecure-requests': '1', 171 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 172 | 'application/signed-exchange;v=b3', 173 | 'accept-encoding': 'gzip, deflate, br', 174 | 'accept-language': 'zh-CN,zh;q=0.9' 175 | } 176 | ) 177 | except ProxyError: 178 | self.client.reload_proxy() 
179 | raise RetryError 180 | except RequestError: 181 | raise RetryError 182 | self.logger.debug(resp.status_code) 183 | 184 | if re.search('"initDataObj":{"needLogin":true}', resp.text) is not None: 185 | raise RetryError 186 | return resp 187 | 188 | def on_parse(self): 189 | data = json.loads(self.response.re.many("window.rawData=.*?}};")[0][15:-1]) 190 | return { 191 | "goods": data["store"]["initDataObj"]["goods"], 192 | "mall": data["store"]["initDataObj"]["mall"], 193 | "reviews": data["store"]["initDataObj"]["reviews"], 194 | } 195 | -------------------------------------------------------------------------------- /spiders/WeiBo/README.md: -------------------------------------------------------------------------------- 1 | ## 微博爬虫 2 | 3 | ### 用户信息(UserTask) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | ------------------------ | ---------------------------------- | ---------------------------------- | 7 | | result.uid | 1680938527 | 用户唯一标识 | 8 | | result.screen_name | 作恶太妖精 | 用户昵称 | 9 | | result.statuses_count | 12275 | 微博数量 | 10 | | result.verified_type | -1 | 账号类型,见[附录](#verified_type) | 11 | | result.verified_type_ext | -1 | 附加账号类型,-1:无 1:橙V 0:金V | 12 | | result.description | 因为追求梦想而伟大!梦想是熬出来的 | 简介 | 13 | | result.gender | f | 性别,f:女 m:男 | 14 | | result.mbtype | 0 | 未知 | 15 | | result.urank | 35 | 账号等级 | 16 | | result.mbrank | 0 | 会员等级 | 17 | | result.followers_count | 754 | 粉丝数量 | 18 | | result.follow_count | 602 | 关注数量 | 19 | | result.profile_image_id | 6431161fjw1e8qgp5bmzyj2050050aa8 | 头像图片号 | 20 | | result.status | 0 | 账号状态,-1:不可用 0:可用 | 21 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 | 22 | 23 | ```python 24 | self.result = { 25 | 'uid': 2554193671, 26 | 'screen_name': '黑镜头世界', 27 | 'statuses_count': 88, 28 | 'verified_type': -1, 29 | 'verified_type_ext': -1, 30 | 'description': '一张残旧的老照片,能给你带来灌顶的震撼~', 31 | 'gender': 'm', 32 | 'mbtype': 0, 33 | 'urank': 2, 34 | 'mbrank': 0, 35 | 'followers_count': 84, 36 | 'follow_count': 4, 37 | 'profile_image_id': '983de707jw1e8qgp5bmzyj2050050aa8', 38 | 'status': 0, 39 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 10, 231390) 40 | } 41 | ``` 42 | 43 | 44 | 45 | ### 原创微博(UserTask) 46 | 47 | 每个用户前10条微博中的原创微博 48 | 49 | | 字段 | 示例 | 说明 | 50 | | ---------------------- | ------------------------------------ | ------------ | 51 | | result.mid | 4264355334054790 | 微博唯一标识 | 52 | | result.uid | 1225419417 | 用户唯一标识 | 53 | | result.text | 哇!抽到了!爱国宝 | 微博正文 | 54 | | result.reposts_count | 14 | 转发数量 | 55 | | result.comments_count | 114 | 评论数量 | 56 | | result.attitudes_count | 1481 | 点赞数量 | 57 | | result.source | iPhone X | 来源 | 58 | | result.updated_at | 2018-08-10 00:02:09 | 抓取时间 | 59 | | result.created_at | 2018-07-21 22:56:41 | 发表时间 | 60 | | result.images | ["490a6a99gy1fthvjguf0gj20v91voqbr"] | 图片列表 | 61 | | result.is_long_text | False | 是否为长微博 | 62 | 63 | ```python 64 | self.result = [ 65 | { 66 | 'mid': 4278823505781372, 67 | 'uid': 2094949595, 68 | 'text': '杭州的绿水青山留下了许多诗句,和风熏,杨柳轻,郁郁青山江水平,笑语满香径。什么使你爱上了这座城市?{网页链接}(https://weibo.com/tv/v/Gw6iL1Q0e?fid=1034:4276507087207862) \u200b', 69 | 'reposts_count': 1, 70 | 'comments_count': 1, 71 | 'attitudes_count': 2, 72 | 'source': '微博 weibo.com', 73 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904636), 74 | 'created_at': datetime.datetime(2018, 8, 30, 21, 8, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 75 | 'images': [], 76 | 'is_long_text': False 77 | }, 78 | { 79 | 'mid': 4278785248875113, 80 | 'uid': 2094949595, 81 | 'text': '你当时学的专业是什么?你现在又在做什么工作呢? 
\u200b', 82 | 'reposts_count': 0, 83 | 'comments_count': 12, 84 | 'attitudes_count': 1, 85 | 'source': '微博 weibo.com', 86 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904846), 87 | 'created_at': datetime.datetime(2018, 8, 30, 18, 36, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 88 | 'images': ['7cde64dbgy1furl2c240jj20e80cujs2'], 89 | 'is_long_text': False 90 | }, 91 | ] 92 | 93 | ``` 94 | 95 | 96 | 97 | ### 转发微博(UserTask) 98 | 99 | 每个用户前10条微博中的转发微博 100 | 101 | | 字段 | 示例 | 说明 | 102 | | ---------------------- | ---------------------- | ------------------------------- | 103 | | result.mid | 4269756171586532 | 微博唯一标识 | 104 | | result.uid | 1680938527 | 用户唯一标识 | 105 | | result.text | //@李宇春如初:转发微博 | 微博正文 | 106 | | result.reposts_count | 0 | 转发数量 | 107 | | result.comments_count | 0 | 评论数量 | 108 | | result.attitudes_count | 0 | 点赞数量 | 109 | | result.source | iPhone客户端 | 来源 | 110 | | result.pmid | 4269752379437757 | 父级微博的mid(上层转发,可空) | 111 | | result.smid | 4269748974496983 | 源微博的mid(原创微博) | 112 | | result.suid | 5427461387 | 源微博的uid | 113 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 | 114 | | result.created_at | 2018-08-05 20:37:42 | 发表时间 | 115 | 116 | ```python 117 | self.result = [ 118 | {'mid': 4278871820165470, 119 | 'uid': 1802393212, 120 | 'text': '这壁纸超萌哦,喜欢就快来打call @Line壁纸酱', 121 | 'reposts_count': 0, 122 | 'comments_count': 0, 123 | 'attitudes_count': 2, 124 | 'source': '皮皮时光机', 125 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250057), 126 | 'created_at': datetime.datetime(2018, 8, 31, 0, 20, 2, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 127 | 'pmid': 0, 128 | 'smid': 4278723350035431, 129 | 'suid': 6150916523 130 | }, 131 | { 132 | 'mid': 4278866795185761, 133 | 'uid': 1802393212, 134 | 'text': '[心]', 135 | 'reposts_count': 0, 136 | 'comments_count': 0, 137 | 'attitudes_count': 2, 138 | 'source': '皮皮时光机', 139 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250450), 140 | 'created_at': datetime.datetime(2018, 8, 31, 0, 0, 4, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 141 | 'pmid': 0, 142 | 'smid': 4266013078506248, 143 | 'suid': 5604000425} 144 | ] 145 | ``` 146 | 147 | 148 | 149 | ### 关注列表(FollowerTask) 150 | 151 | 每个用户最后180个关注、部分大V关注 152 | 153 | | 字段 | 示例 | 说明 | 154 | | ------ | ------------------------ | ----------------------- | 155 | | result | [5427461387, 1680938527] | 关注列表中所有用户的uid | 156 | 157 | ```python 158 | self.result = [ 159 | 1199430302, 5291824241, 1744583555, 1225627080, 1192504311, 1539469391, 1831216671, 1855790127, 160 | ] 161 | ``` 162 | 163 | + 通过self.uid获取当前用户UID 164 | 165 | 166 | 167 | ### 粉丝列表(FanTask) 168 | 169 | 每个用户最后4500个粉丝、部分大V粉丝 170 | 171 | | 字段 | 示例 | 说明 | 172 | | ------ | -------------------------- | ----------------------- | 173 | | result | [5427461387", "1680938527] | 粉丝列表中所有用户的uid | 174 | 175 | ```python 176 | self.result = [ 177 | 2011541160, 6561198332, 5650361179, 5203386014, 6586203686, 3975892466, 5280555723, 6200526771, 178 | ] 179 | ``` 180 | 181 | + 通过self.uid获取当前用户UID 182 | 183 | 184 | 185 | ### 附录 186 | 187 | #### verified_type 188 | 189 | |值|含义| 190 | |:---|:---| 191 | |-1|无认证| 192 | |0|个人认证| 193 | |1|政府| 194 | |2|企业| 195 | |3|媒体| 196 | |4|校园| 197 | |5|网站| 198 | |6|应用| 199 | |7|机构| 200 | |8|待审企业| 201 | |200|初级达人| 202 | |220|中高级达人| 203 | |400|已故V用户| 204 | 205 | -------------------------------------------------------------------------------- /spiders/WeiBo/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.tasks import * 2 | -------------------------------------------------------------------------------- /spiders/WeiBo/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "user:1111681197", 4 | "user:1863847262" 5 | ] 6 | # 调度间隔时间(秒) 7 | # 控制当前爬虫的抓取频率 8 | interval = 0.2 9 | # 任务超时时间(秒) 10 | # 超时后,将被移入tag='timeout'的异常队列中 11 | timeout = 120 12 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 13 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 14 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 15 | # 特别注意,此字段一经设置不可更改 16 | precision = 0.00001 17 | # 自定义参数 18 | # Task中使用self.args访问这里的args 19 | args = {} 20 | -------------------------------------------------------------------------------- /spiders/WeiBo/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import datetime 3 | import itertools 4 | from pyloom import tasks 5 | from pyloom.errors import * 6 | 7 | 8 | class PWATask(tasks.Task): 9 | _redis = None 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(PWATask, self).__init__(*args, **kwargs) 13 | self.uid = self.url.split(":")[1] 14 | self.client.headers = { 15 | 'Accept': 'application/json, text/plain, */*', 16 | 'Referer': f'https://m.weibo.cn/profile/{self.uid}', 17 | 'MWeibo-Pwa': '1', 18 | 'X-Requested-With': 'XMLHttpRequest', 19 | 'User-Agent': self.ua.chrome 20 | } 21 | 22 | @tasks.retry(tries=16, delay=0, backoff=0) 23 | def download(self, url, params): 24 | """下载并判断是否被封禁""" 25 | try: 26 | resp = self.client.get(url, params, timeout=8) 27 | except (ProxyError, RequestError) as e: 28 | if self.download.count >= 3: # 重试两次后更换代理 29 | self.logger.debug("请求错误", e) 30 | self.client.reload_proxy() 31 | raise RetryError 32 | try: 33 | errno = resp.json.get('errno', None) 34 | except JSONDecodeError: 35 | errno = None 36 | if resp.status_code == 418 or (resp.status_code == 403 and errno == "100005"): 37 | self.logger.debug("响应包错误,IP已被封禁") 38 | self.client.reuse_proxy(150) 39 | self.client.reload_proxy() 40 | raise RetryError 41 | elif errno == "20003": 42 | self.logger.debug("响应包错误,用户不存在", self.uid) 43 | raise TaskFinish() 44 | elif resp.status_code != 200: 45 | self.logger.debug(f"响应包错误,状态码:{resp.status_code}", self.uid) 46 | if self.download.count >= 3: 47 | self.client.reload_proxy() 48 | raise RetryError 49 | elif errno is not None: 50 | msg = resp.json['msg'] 51 | self.logger.debug(f"响应包错误,errno={errno},msg={msg}", self.uid) 52 | raise TaskError("msg:" + msg) 53 | else: 54 | self.client.reuse_proxy(0) 55 | return resp 56 | 57 | 58 | class UserTask(PWATask): 59 | """用户资料""" 60 | filters = "user:\w+" 61 | 62 | def on_download(self): 63 | return self.download('https://m.weibo.cn/profile/info', {'uid': self.uid}) 64 | 65 | def parse_text(self, _text): 66 | """转换微博内容以节约空间""" 67 | 68 | # 将表情包转为:[拜拜] 69 | def replacer_first(match): 70 | return match.groups()[0] 71 | 72 | text = re.sub( 73 | r']+class="url-icon">\s*([^', replacer_first, _text 74 | ) 75 | 76 | # 将链接转为:{title}(url) 77 | # title一般为话题 78 | def replacer_link(match): 79 | groups = match.groups() 80 | return f"{{{groups[1]}}}({groups[0]})" 81 | 82 | text = re.sub( 83 | r']+href="([^"]+)".*?>\s*([^<>]+)\s*', 84 | replacer_link, text 85 | ) 86 | # 将@连接转为: @XXX 87 | text = re.sub(r']+>(@[^<>]+)', replacer_first, text) 88 | # 将
<br />转为\n 89 | text = text.replace("<br />
", "\n") 90 | return text 91 | 92 | def parse_status(self, _status): 93 | """递归提取源微博与被转发微博""" 94 | status = { 95 | 'mid': int(_status['id']), 96 | 'uid': _status['user']['id'], 97 | 'text': self.parse_text(_status['text']), 98 | 'reposts_count': _status['reposts_count'], 99 | 'comments_count': _status['comments_count'], 100 | 'attitudes_count': _status['attitudes_count'], 101 | 'source': _status['source'], 102 | 'updated_at': datetime.datetime.now(), 103 | 'created_at': datetime.datetime.strptime( 104 | _status['created_at'], "%a %b %d %H:%M:%S %z %Y"), 105 | } 106 | retweeted_status = _status.get('retweeted_status', None) 107 | if retweeted_status: # 转发 108 | status['pmid'] = _status.get('pid', 0) 109 | status['smid'] = int(retweeted_status['id']) 110 | status['suid'] = int(retweeted_status['user']['id']) 111 | repost = status 112 | status, _ = self.parse_status(retweeted_status) 113 | return status, repost 114 | else: # 原创 115 | status['images'] = _status['pic_ids'] 116 | status['is_long_text'] = _status['isLongText'] 117 | return status, None 118 | 119 | def on_parse(self): 120 | # 用户信息 121 | _user = self.response.json['data']['user'] 122 | user = { 123 | 'uid': _user['id'], 124 | 'screen_name': _user['screen_name'], 125 | 'statuses_count': _user['statuses_count'], 126 | 'verified_type': _user['verified_type'], 127 | 'verified_type_ext': _user.get('verified_type_ext', -1), 128 | 'description': _user['description'], 129 | 'gender': _user['gender'], 130 | 'mbtype': _user['mbtype'], 131 | 'urank': _user['urank'], 132 | 'mbrank': _user['mbrank'], 133 | 'followers_count': _user['followers_count'], 134 | 'follow_count': _user['follow_count'], 135 | 'profile_image_id': _user['profile_image_url'].rsplit("/", 1)[1].split(".")[0], 136 | 'status': 0, 137 | 'updated_at': datetime.datetime.now() 138 | } 139 | # 最近微博 140 | statuses = [] 141 | reposts = [] 142 | for _status in self.response.json['data']['statuses']: 143 | status, repost = self.parse_status(_status) 144 | if status: 145 | statuses.append(status) 146 | if repost: 147 | reposts.append(repost) 148 | 149 | return user, statuses, reposts 150 | 151 | def on_link(self): 152 | return { 153 | 3: [f'follow:{self.uid}'], 154 | 4: [f'fan:{self.uid}'] 155 | } 156 | 157 | def on_save(self): 158 | self.logger.info("抓到用户信息", self.result[0]) 159 | if self.result[1]: 160 | self.logger.info("抓到原创微博", self.result[1]) 161 | if self.result[2]: 162 | self.logger.info("抓到转发微博", self.result[2]) 163 | 164 | 165 | class ContainerTask(PWATask): 166 | """解析关注和粉丝列表的响应包""" 167 | 168 | def on_parse(self): 169 | targets = [] 170 | for page in self.response: 171 | cards = page.json['data']['cards'] 172 | for card in cards: 173 | style = card.get('card_style', None) 174 | group = card['card_group'] 175 | if style is None: # 普通用户 176 | targets.extend(g['user']['id'] for g in group) 177 | elif style == 1: # 推荐用户 178 | if len(group) == 3 and 'scheme' in group[2]: # 相关大V用户 179 | if 'users' in group[1]: 180 | ids = [user['id'] for user in group[1]['users']] 181 | elif 'user' in group[1]: 182 | ids = [group[1]['user']['id']] 183 | else: 184 | ids = [] 185 | else: # 大V用户 186 | ids = [g['user']['id'] for g in group if 'user' in g] 187 | targets.extend(ids) 188 | else: 189 | raise TaskError(f"card_style={style}") 190 | _targets = [] 191 | for t in targets: 192 | try: 193 | _targets.append(int(t)) 194 | except ValueError: 195 | pass 196 | return _targets 197 | 198 | def on_link(self): 199 | return {1: [f"user:{uid}" for uid in self.result]} if self.result else {} 200 | 201 | 202 
| class FollowerTask(ContainerTask): 203 | """关注列表""" 204 | filters = "follow:\w+" 205 | 206 | def on_download(self): 207 | pages = [] 208 | url = "https://m.weibo.cn/api/container/getIndex" 209 | for page_id in itertools.count(1): 210 | params = {"containerid": f"231051_-_followers_-_{self.uid}"} 211 | if page_id != 1: 212 | params['page'] = page_id 213 | resp = self.download(url, params) 214 | if resp.json['ok'] == 0: # 已到最后一页 215 | break 216 | pages.append(resp) 217 | return pages 218 | 219 | def on_save(self): 220 | self.logger.info("抓到关注列表", self.result) 221 | 222 | 223 | class FanTask(ContainerTask): 224 | """粉丝列表""" 225 | filters = "fan:\w+" 226 | 227 | def on_download(self): 228 | pages = [] 229 | url = "https://m.weibo.cn/api/container/getIndex" 230 | for since_id in itertools.count(1): 231 | params = {"containerid": f"231051_-_fans_-_{self.uid}"} 232 | if since_id != 1: 233 | params['since_id'] = since_id 234 | resp = self.download(url, params) 235 | if resp.json['ok'] == 0: # 已到最后一页 236 | break 237 | pages.append(resp) 238 | return pages 239 | 240 | def on_save(self): 241 | self.logger.info("抓到粉丝列表", self.result) 242 | -------------------------------------------------------------------------------- /spiders/Ziroom/README.md: -------------------------------------------------------------------------------- 1 | ## 自如爬虫 2 | 3 | ### 安装 4 | 5 | 1. [安装tesseract-ocr](https://github.com/tesseract-ocr/tesseract/wiki) 6 | 7 | 2. 安装pytesseract库 8 | 9 | ``` 10 | pip install pytesseract 11 | ``` 12 | 13 | 14 | ### 房源列表(NLTask) 15 | 16 | 房源列表中的房源信息 17 | 18 | | 字段 | 示例 | 说明 | 19 | | ---------------- | ----------------------------------------------------- | ------------ | 20 | | result.price | 1160 | 价格 | 21 | | result.href | www.ziroom.com/z/vr/61441027.html | 详情信息链接 | 22 | | result.img_src | static8.ziroom.com/phoenix/pc/images/list/loading.jpg | 图片 | 23 | | result.block | 天恒乐活城 | 小区名 | 24 | | result.name | 整租 · 天恒乐活城2居室-南 | 房源名 | 25 | | result.site | [通州通州其它] 亦庄线次渠南 | 位置 | 26 | | result.detail | 14.1 ㎡\|6/6层\|3室1厅距15号线石门站690米 | 细节 | 27 | | result.room_tags | ['离地铁近', '独立阳台', '集体供暖', '友家3.0 木棉'] | 标签 | 28 | 29 | ```python 30 | self.result = [ 31 | { 32 | 'price': '1830', 33 | 'href': 'www.ziroom.com/z/vr/61514855.html', 34 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg', 35 | 'block': '世茂维拉', 'name': '友家 · 世茂维拉5居室-南卧', 36 | 'site': '[房山长阳] 房山线广阳城', 37 | 'detail': '12.3 ㎡|5/5层|5室1厅距房山线广阳城站696米有2间空房', 38 | 'room_tags': ['离地铁近', '独卫', '集体供暖', '友家4.0 拿铁'] 39 | }, 40 | { 41 | 'price': '1830', 42 | 'href': 'www.ziroom.com/z/vr/261810.html', 43 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg', 44 | 'block': '前进花园石门苑', 45 | 'name': '友家 · 前进花园石门苑3居室-南卧', 46 | 'site': '[顺义顺义城] 15号线石门', 47 | 'detail': '14.1 ㎡|6/6层|3室1厅距15号线石门站690米', 48 | 'room_tags': ['离地铁近', '独立阳台', '集体供暖', '友家4.0 布丁'] 49 | }, 50 | ... 51 | ] 52 | ``` 53 | 54 | 55 | 56 | ### 房源详情(VRTask) 57 | 58 | | 字段 | 示例 | 说明 | 59 | | ------------------------ | -------------------------------------------------------- | ------------------------------------------ | 60 | | result.img | ['http://pic.ziroom.com/house_images.jpg',...] 
| 介绍图片 | 61 | | result.room_name | 和平家园小区4居室-02卧 | 名称 | 62 | | result.ellipsis | [昌平 沙河] 昌平线 昌平 | 位置 | 63 | | result.room_id | 61264525 | 房间id | 64 | | result.house_id | 60203175 | 房屋id | 65 | | result.current_city_code | 110000 | 所在城市编码,见[附录](#current_city_code) | 66 | | result.detail_room | {'面积': '15.4㎡', '朝向': '南', '户型': '4室1厅合',...} | 房屋参数 | 67 | | result.number | BJZRGY0818215849_02 | 编号 | 68 | | result.periphery | 学校:中国政法大学法学院、中国石油大学... | 周边 | 69 | | result.traffic | 公交:314路、昌平9路... | 交通 | 70 | | result.configuration | ['bed', 'desk', 'chest', 'calorifier'] | 配置 | 71 | | result.roommate | [{性别': 'man', '房间号': '01卧', '星座': '天蝎座',...}] | 室友信息 | 72 | | result.price | 1890 | 价格 | 73 | 74 | ```python 75 | self.result = { 76 | 'img':[ 77 | 'http://pic.ziroom.com/house_images/g2m1/M00/5B/DF/v180x135.jpg', 78 | 'http://pic.ziroom.com/house_images/g2m1/M00/5D/0B/v180x135.jpg' 79 | ], 80 | 'room_name': '和平家园小区4居室-02卧', 81 | 'ellipsis': '[昌平 沙河] 昌平线 昌平', 82 | 'room_id': '61264525', 83 | 'house_id': '60203175', 84 | 'current_city_code': '110000', 85 | 'detail_room': { 86 | '面积': '15.4㎡', 87 | '朝向': '南', 88 | '户型': '4室1厅合', 89 | '楼层': '6/6层', 90 | '交通': '距15号线石门307米距15号线顺义1621米距15号线南法信2290米' 91 | }, 92 | 'number': 'BJZRGY0818215849_02', 93 | 'periphery': '学校:中国政法大学法学院 医院:北京化工大学校医院', 94 | 'traffic': '公交:314路、昌平9路、914路、昌平3路、昌平5路、326路、345路', 95 | 'configuration': ['bed', 'desk', 'chest', 'calorifier], 96 | 'roommate':[ 97 | { 98 | '性别': 'man', 99 | '房间号': '01卧', 100 | '星座': '天蝎座', 101 | '职业': '产品', 102 | '入住时间': '2018/07-2019/07' 103 | }, 104 | { 105 | '性别': 'current', 106 | '房间号': '02卧', 107 | '星座': '…', 108 | '职业': '…', 109 | '入住时间': '…' 110 | } 111 | ], 112 | 'price': 1890 113 | } 114 | ``` 115 | 116 | + 房间室友(result.roommate)不存在时,性别为current,'…'代表信息为空 117 | 118 | ### 附录 119 | 120 | #### current_city_code 121 | 122 | | 城市编码 | 城市名称 | 123 | | -------- | -------- | 124 | | 110000 | 北京 | 125 | | 310000 | 上海 | 126 | | 440300 | 深圳 | 127 | | 330100 | 杭州 | 128 | | 320100 | 南京 | 129 | | 440100 | 广州 | 130 | | 510100 | 成都 | 131 | | 420100 | 武汉 | 132 | | 120000 | 天津 | 133 | 134 | -------------------------------------------------------------------------------- /spiders/Ziroom/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/Ziroom/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "http://www.ziroom.com/z/nl/z3.html" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/Ziroom/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import requests 4 | import pytesseract 5 | import urllib.parse 6 | from PIL import Image 7 | from io import BytesIO 8 | from pyloom import tasks 9 | from pyloom.errors import * 10 | 11 | 12 | class ZiRoomTask(tasks.Task): 13 | @tasks.retry(5, 0) 14 | def on_download(self): 15 | # 解决列表页第一页冲突问题 16 | page = re.search('\\?p=1', self.url) 17 | if page is not None: 18 | self.client.reuse_proxy() 19 
| raise TaskFinish 20 | try: 21 | response = self.client.get( 22 | url=self.url, 23 | allow_redirects=False, 24 | headers={ 25 | "User-Agent": self.ua.chrome 26 | } 27 | ) 28 | except (ProxyError, RequestError) as e: 29 | if self.on_download.count >= 5: # 重试两次后更换代理 30 | self.logger.debug("请求错误", e) 31 | self.client.reload_proxy() 32 | raise RetryError 33 | 34 | if "请核对您输入的页面地址是否正确" in response.text or "The requested URL could not be retrieved" in response.text: 35 | if self.on_download.count >= 5: # 重试两次后更换代理 36 | self.logger.info("请求次数", self.on_download.count) 37 | self.client.reload_proxy() 38 | else: 39 | self.client.reuse_proxy(5) 40 | raise RetryError 41 | if response.status_code == 302: 42 | if self.on_download.count >= 5: # 重试两次后更换代理 43 | self.logger.info("请求次数", self.on_download.count) 44 | self.client.reload_proxy() 45 | else: 46 | self.client.reuse_proxy(5) 47 | raise RetryError 48 | if response.status_code == 500: 49 | raise RetryError 50 | self.client.reuse_proxy() 51 | return response 52 | 53 | @staticmethod 54 | def get_price(response): 55 | """通过图像匹配返回房租价格""" 56 | image_url = re.search('static8.ziroom.com/phoenix/pc/images/price/(\w+).png', response.text)[0] 57 | image = Image.open(BytesIO(requests.get(f'http://{image_url}').content)) 58 | digital_table = pytesseract.image_to_string(image, config='--psm 7') 59 | offset_list = re.search('\\[((,)?\\[(\w)(,\w)+\\])+\\]', response.text)[0] 60 | price_list = [] 61 | for offset in offset_list[2:-2].split('],['): 62 | a = "" 63 | for offset_num in offset.split(','): 64 | a = a + (digital_table[int(offset_num)]) 65 | price_list.append(a) 66 | return price_list 67 | 68 | 69 | class NLTask(ZiRoomTask): 70 | filters = "http://(\w+).ziroom.com/z/nl/\S+" 71 | 72 | def on_parse(self): 73 | """解析链接""" 74 | house_list = self.response.css.many('#houseList li') 75 | houses = [] 76 | if self.response.css.one('.nomsg').default(None).text() is None: 77 | price_list = self.get_price(self.response) 78 | for house in house_list: 79 | houses.append( 80 | { 81 | 'price': price_list[len(houses)], 82 | 'href': house.one('.img a').attrs['href'][2:], 83 | 'img_src': house.one('.img a img').attrs['src'][2:], 84 | 'block': house.one('.img a img').attrs.get('alt', None), 85 | 'name': house.one('.txt h3 a').text(), 86 | 'site': house.one('.txt h4 a').text(), 87 | 'detail': house.one('.txt .detail').text(), 88 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')] 89 | } 90 | ) 91 | else: 92 | for house in house_list: 93 | houses.append( 94 | { 95 | 'price': re.search("(\d+)", house.one('.price').text())[0], 96 | 'href': house.one('.img a').attrs['href'][2:], 97 | 'img_src': house.one('.img a img').attrs['src'][2:], 98 | 'block': house.one('.img a img').attrs.get('alt', None), 99 | 'name': house.one('.txt h3 a').text(), 100 | 'site': house.one('.txt h4 a').text(), 101 | 'detail': house.one('.txt .detail').text(), 102 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')] 103 | } 104 | ) 105 | return houses 106 | 107 | def on_link(self): 108 | paths = list(set(self.response.re.many('\w+.ziroom.com/z/nl/\S+?.html\\??p?=?\d*'))) 109 | return { 110 | 2: [f'http://{house["href"]}' for house in self.result], 111 | 4: [f'http://{path}' for path in paths] 112 | } 113 | 114 | def on_save(self): 115 | self.logger.info(f'抓到房源列表 {self.result}') 116 | 117 | 118 | class VRTask(ZiRoomTask): 119 | filters = "http://(\w+).ziroom.com/z/vr/(\w+)" 120 | 121 | def on_parse(self): 122 | detail_room = {} 123 | for i in 
self.response.css.many(".detail_room li"): 124 | detail = re.sub('\s', '', i.text()).split(':') 125 | detail_room[detail[0]] = detail[1] 126 | info = { 127 | 'img': [img.attrs['src'] for img in self.response.css.many('.lidiv img')], 128 | 'room_name': self.response.css.one('.room_name h2').default(None).text(), 129 | 'ellipsis': ' '.join(filter( 130 | lambda x: x, self.response.css.one('.room_detail_right .ellipsis').text().split())), 131 | 'room_id': self.response.css.one('#room_id').attrs.get("value"), 132 | 'house_id': self.response.css.one('#house_id').attrs.get("value"), 133 | 'current_city_code': self.response.css.one('#current_city_code').attrs.get('value'), 134 | 'detail_room': detail_room, 135 | 'number': self.response.css.one('.aboutRoom h3').text()[3:], 136 | 'periphery': self.response.css.many('.aboutRoom p')[0].text()[3:], 137 | 'traffic': self.response.css.many('.aboutRoom p')[1].text()[3:] 138 | } 139 | roommate = [] 140 | for i in self.response.css.many('.greatRoommate li'): 141 | roommate.append({ 142 | '性别': i.attrs.get("class")[0], 143 | '房间号': i.one('.user_top p').text(), 144 | '星座': i.one('.sign').text()[0:-2], 145 | '职业': i.one('.jobs').text()[0:-2], 146 | '入住时间': i.one('.user_bottom p').text(), 147 | }) 148 | info['roommate'] = roommate 149 | conf = self.client.get( 150 | url=f"http://www.ziroom.com/detail/config?house_id={info['house_id']}&id={info['room_id']}", 151 | headers={ 152 | "User-Agent": self.ua.chrome 153 | } 154 | ) 155 | configuration = [] 156 | for i in conf.json['data']: 157 | if conf.json['data'].get(i) == 1: 158 | configuration.append(i) 159 | info['configuration'] = configuration 160 | cookies = self.response.cookies.get_dict() 161 | for cookie in cookies: 162 | if 'nlist' in cookie: 163 | info['price'] = json.loads(urllib.parse.unquote( 164 | self.response.cookies.get_dict()[cookie]))[info["room_id"]]['sell_price'] 165 | break 166 | return info 167 | 168 | def on_save(self): 169 | self.logger.info(f'抓到房源信息 {self.result}') 170 | --------------------------------------------------------------------------------
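The `created_at` values in the WeiBo examples above carry a UTC+8 offset (`datetime.timezone(datetime.timedelta(0, 28800))`). A minimal sketch of the conversion `UserTask.parse_status` performs with `strptime`, using a made-up raw string in the shape m.weibo.cn returns:

```python
import datetime

# Hypothetical raw value shaped like the created_at field returned by m.weibo.cn;
# the format string is the one UserTask.parse_status passes to strptime.
raw = "Thu Aug 30 18:36:03 +0800 2018"
created_at = datetime.datetime.strptime(raw, "%a %b %d %H:%M:%S %z %Y")

print(created_at)                                                   # 2018-08-30 18:36:03+08:00
print(created_at.utcoffset() == datetime.timedelta(seconds=28800))  # True, i.e. UTC+8
```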
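Each WeiBo task class declares a `filters` regex (`user:\w+`, `follow:\w+`, `fan:\w+`), and `configs.seeders` plus `on_link` feed matching URLs back into the queue. The routing itself happens inside pyloom's scheduler/worker, which is not shown in this section; the sketch below only illustrates how such patterns relate to the queue URLs, assuming a full-string regex match:

```python
import re

# filters patterns declared by the WeiBo task classes, paired with the class
# names they belong to (illustration only; dispatch is pyloom's job).
filters = [
    (r"user:\w+", "UserTask"),
    (r"follow:\w+", "FollowerTask"),
    (r"fan:\w+", "FanTask"),
]

for url in ["user:1111681197", "follow:1111681197", "fan:1111681197"]:
    matches = [name for pattern, name in filters if re.fullmatch(pattern, url)]
    print(url, "->", matches)
# user:1111681197   -> ['UserTask']
# follow:1111681197 -> ['FollowerTask']
# fan:1111681197    -> ['FanTask']
```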
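Both `configs.py` files set `precision`, the BloomFilter false-positive rate used to skip URLs already marked as finished, and note the trade-off against server memory. pyloom's actual sizing lives in `pyloom/buckets.py` and the Lua scripts, which are not part of this section; as a rough rule of thumb for a standard Bloom filter, the cost per stored URL is about `-ln(p) / (ln 2)^2` bits:

```python
import math

# Rule-of-thumb sizing for a standard Bloom filter; pyloom's own implementation
# may differ, this is only to give a feel for the precision/memory trade-off.
def bloom_bits_per_item(p):
    """Approximate bits per stored URL for a target false-positive rate p."""
    return -math.log(p) / (math.log(2) ** 2)

for p in (0.0001, 0.00001):  # the precisions used by the Ziroom and WeiBo configs
    print(p, round(bloom_bits_per_item(p), 1), "bits/URL")
# 0.0001 -> ~19.2 bits/URL
# 1e-05  -> ~24.0 bits/URL
```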
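`ZiRoomTask.get_price` de-obfuscates prices by OCR-ing a digits image with pytesseract and then indexing the recognized string with per-listing offset lists scraped from the page. A toy walk-through of that decode step, with made-up values standing in for the OCR result and the offsets:

```python
# Made-up stand-ins for what ZiRoomTask.get_price works with: pytesseract reads
# the digits PNG into a string, and the page embeds offset lists that index
# into that string to assemble each listing's price.
digital_table = "4871290653"                 # e.g. pytesseract.image_to_string(...) output
offset_lists = [[3, 3, 7, 6], [3, 1, 9, 6]]  # e.g. parsed from the "[[3,3,7,6],[...]]" blob

prices = ["".join(digital_table[i] for i in offsets) for offsets in offset_lists]
print(prices)  # ['1160', '1830']
```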