├── pyloom
│   ├── __init__.py
│   ├── __main__.py
│   ├── errors.py
│   ├── lua
│   │   ├── url_pop.lua
│   │   ├── url_add.lua
│   │   ├── bloom_check.lua
│   │   └── bloom_cas.lua
│   ├── drivers.py
│   ├── proxy.py
│   ├── buckets.py
│   ├── worker.py
│   ├── utils.py
│   ├── scheduler.py
│   ├── tasks.py
│   ├── entry.py
│   └── user-agent.json
├── spiders
│   ├── PinDuoDuoWEB
│   │   ├── __init__.py
│   │   ├── configs.py
│   │   ├── tasks.py
│   │   └── README.md
│   ├── WeiBo
│   │   ├── __init__.py
│   │   ├── configs.py
│   │   ├── README.md
│   │   └── tasks.py
│   ├── Ziroom
│   │   ├── __init__.py
│   │   ├── configs.py
│   │   ├── README.md
│   │   └── tasks.py
│   ├── DouBanBooks
│   │   ├── __init__.py
│   │   ├── configs.py
│   │   ├── README.md
│   │   └── tasks.py
│   ├── PinDuoDuo
│   │   ├── __init__.py
│   │   ├── configs.py
│   │   ├── README.md
│   │   └── tasks.py
│   ├── DouBan250
│   │   ├── README.md
│   │   ├── configs.py
│   │   └── __init__.py
│   └── LaGou
│       ├── __init__.py
│       ├── configs.py
│       ├── README.md
│       └── tasks.py
├── setup.py
├── README.md
└── .gitignore
/pyloom/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import Task
2 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuoWEB/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import *
--------------------------------------------------------------------------------
/spiders/WeiBo/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import *
2 |
--------------------------------------------------------------------------------
/spiders/Ziroom/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import *
2 |
--------------------------------------------------------------------------------
/spiders/DouBanBooks/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import *
2 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuo/__init__.py:
--------------------------------------------------------------------------------
1 | from .tasks import *
2 |
--------------------------------------------------------------------------------
/spiders/DouBan250/README.md:
--------------------------------------------------------------------------------
1 | ## Douban Top 250 Movies Spider
2 |
3 | Used to demonstrate writing the most basic spider and to test new features.
--------------------------------------------------------------------------------
/pyloom/__main__.py:
--------------------------------------------------------------------------------
1 | """作为模块启动"""
2 | if __name__ == '__main__':
3 | from .entry import main
4 |
5 | main()
6 |
--------------------------------------------------------------------------------
/spiders/LaGou/__init__.py:
--------------------------------------------------------------------------------
1 | import time
2 | from .tasks import *
3 |
4 |
5 | def reactor():
6 | return time.time()
7 |
--------------------------------------------------------------------------------
/spiders/DouBan250/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "https://movie.douban.com/top250"
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = 5
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
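# For example, a spider-specific knob could be passed here and then read from a Task.
# The key name below is hypothetical, for illustration only:
# args = {"max_pages": 10}
# ...and in the Task code: self.args.get("max_pages")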
--------------------------------------------------------------------------------
/spiders/Ziroom/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "http://www.ziroom.com/z/nl/z3.html"
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = 0
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.0001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
--------------------------------------------------------------------------------
/spiders/DouBanBooks/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "https://book.douban.com/tag/?view=cloud"
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = -10
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.0001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
--------------------------------------------------------------------------------
/spiders/WeiBo/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "user:1111681197",
4 | "user:1863847262"
5 | ]
6 | # Scheduling interval (seconds)
7 | # Controls how frequently this spider fetches pages
8 | interval = 0.2
9 | # Task timeout (seconds)
10 | # After a timeout, the URL is moved into the error queue with tag='timeout'
11 | timeout = 120
12 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
13 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
14 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
15 | # Note: once set, this field must never be changed
16 | precision = 0.00001
17 | # Custom arguments
18 | # Inside a Task, access these args via self.args
19 | args = {}
20 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuoWEB/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "https://mobile.yangkeduo.com/classification.html"
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = 0
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.0001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuo/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations?pdduid="
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = 0
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.0001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
--------------------------------------------------------------------------------
/spiders/LaGou/configs.py:
--------------------------------------------------------------------------------
1 | # Seed pages pushed into the queue when the spider is initialized
2 | seeders = [
3 | "https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false"
4 | ]
5 | # Scheduling interval (seconds)
6 | # Controls how frequently this spider fetches pages
7 | interval = 3600
8 | # Task timeout (seconds)
9 | # After a timeout, the URL is moved into the error queue with tag='timeout'
10 | timeout = 120
11 | # BloomFilter precision, used to filter out 'finished' URLs and avoid duplicate fetches
12 | # If the precision is set too low, too many pages will be falsely reported as 'finished'
13 | # Weigh the spider's tolerance for false positives against server memory usage and adjust accordingly
14 | # Note: once set, this field must never be changed
15 | precision = 0.001
16 | # Custom arguments
17 | # Inside a Task, access these args via self.args
18 | args = {}
19 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='pyloom',
5 | version='0.0.7',
6 | packages=['pyloom'],
7 | url='https://pyloom.com',
8 | license='https://opensource.org/licenses/MIT',
9 | author='pyloom',
10 | author_email='ss@uutoto.com',
11 | description='古老的东方有一条虫,它的名字叫爬龙。',
12 | entry_points={
13 | 'console_scripts': [
14 | 'pyloom = pyloom.entry:main'
15 | ]
16 | },
17 | install_requires=[
18 | 'redis',
19 | 'cryptography >= 2.2.1',
20 | 'requests[security, socks] >= 2.10.0',
21 | 'bs4',
22 | 'lxml',
23 | 'furl',
24 | 'simplejson',
25 | 'checksumdir',
26 | 'docutils', # dependency of python-daemon
27 | 'python-daemon',
28 | 'tabulate',
29 | 'psutil'
30 | ]
31 | )
32 |
--------------------------------------------------------------------------------
/spiders/DouBan250/__init__.py:
--------------------------------------------------------------------------------
1 | from pyloom.tasks import *
2 |
3 |
4 | class DouBanTask(Task):
5 | filters = ["^https://movie.douban.com/top250(\?start=\d+)?$"]
6 |
7 | def on_download(self):
8 | return self.client.get(
9 | url=self.url,
10 | headers={
11 | "Host": "movie.douban.com",
12 | "User-Agent": self.ua.chrome
13 | }
14 | )
15 |
16 | def on_parse(self):
17 | nodes = self.response.css.many("div.article ol > li")
18 | return [{
19 | "title": node.one("span.title").text(),
20 | "rating": node.one("span.rating_num").text(),
21 | "quote": node.one("p.quote > span.inq").text()
22 | } for node in nodes]
23 |
24 | def on_link(self):
25 | if self.url.endswith("top250"):
26 | return [f"{self.url}?start={i}" for i in range(25, 250, 25)]
27 |
28 | def on_save(self):
29 | for movie in self.result:
30 | self.logger.info("抓到电影", movie)
31 |
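# Hook order, as suggested by this demo and the other spiders in this repository:
# on_download fetches the page, the value returned by on_parse becomes self.result,
# on_link returns new URLs to enqueue (optionally as a dict keyed by priority 0-4),
# and on_save consumes self.result.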
--------------------------------------------------------------------------------
/spiders/DouBanBooks/README.md:
--------------------------------------------------------------------------------
1 | ## Douban Books Spider
2 |
3 | ### Book details (BookDetailsTask)
4 |
5 | | Field                | Example                                                       | Description      |
6 | | -------------------- | ------------------------------------------------------------- | ---------------- |
7 | | result.title         | 社会研究方法                                                  | Title            |
8 | | result.cover         | https://img3.doubanio.com/view/subject/l/public/s2932505.jpg  | Cover image      |
9 | | result.info          | {'作者': '[美]劳伦斯·纽曼', '出版社': '中国人民大学出版社'}   | Basic info       |
10 | | result.rating_num    | 9.0                                                           | Rating           |
11 | | result.rating_people | 202                                                           | Number of raters |
12 | | result.intro         | 迄今所见中文社会研究方法书中最好的……                          | Introduction     |
13 | | result.tags          | ['社会学', '研究方法']                                        | Tags             |
14 |
15 | ```python
16 | self.result = {
17 | 'title': '社会研究方法',
18 | 'cover': 'https://img3.doubanio.com/view/subject/l/public/s2932505.jpg',
19 | 'info': {
20 | '作者': '[美]劳伦斯·纽曼',
21 | '出版社': '中国人民大学出版社',
22 | '副标题': '定性和定量的取向',
23 | '原作名': 'Basics of Social Research: Qualitative and Quantitative Approaches',
24 | '译者': '郝大海',
25 | '出版年': '2007',
26 | '页数': '809',
27 | '定价': '89.80元',
28 | '丛书': '社会学译丛·经典教材系列',
29 | 'ISBN': '9787300075648'
30 | },
31 | 'rating_num': '9.0',
32 | 'rating_people': '202',
33 | 'intro': '迄今所见中文社会研究方法书中最好的一本,极力推荐研究生教学中采用。理清了许多问题,对定性和定量的对比非常精彩。',
34 | 'tags': ['社会学', '研究方法', '方法论', '社会研究方法', '定性', '教材', '纽曼', '定量']
35 | }
36 | ```
37 |
38 | + Basic info (result.info): the set of fields differs from book to book.
39 |
40 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyLoom,爬龙!
2 |
3 | PyLoom aims to build spiders for valuable websites, so that developers can conveniently obtain structured data.
4 |
5 | PyLoom consists of three parts:
6 |
7 | 1. The framework, which reduces the work of writing, running, and maintaining spiders.
8 |
9 | 2. The spiders: find valuable targets, develop spiders for them, and keep existing spiders working.
10 |
11 | By the end of 2019, PyLoom is expected to have dozens of spiders covering e-commerce, housing rental and sales, social networks, and news media.
12 |
13 | 3. Upgraded spiders: enhance the capabilities of frequently used spiders
14 | + stronger customization, e.g. restricting crawls to a region, category, or keyword;
15 | + stronger crawling strategies, reducing reliance on proxies and captcha-solving services;
16 | + stronger update strategies, computing re-crawl times at a finer granularity.
17 |
18 | Current progress:
19 |
20 | ① partly done; it is sufficient for developing common spiders, and more features will follow as new spiders are developed;
21 |
22 | ② several spiders are available under the `spiders` directory.
23 |
24 |
25 |
26 | ## Installation
27 |
28 | 1. **Requirements**
29 |
30 | + python 3.6.0+
31 | + redis 2.6+
32 | + a unix-like system
33 |
34 | 2. **Install PyLoom**
35 |
36 | ```bash
37 | git clone https://github.com/spencer404/PyLoom.git
38 | python3.6 -m pip install -e ./PyLoom
39 | ```
40 |
41 | > Add the `-i https://pypi.douban.com/simple` option to install faster via the Douban mirror.
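For example, the same install command with the mirror option added:

```bash
python3.6 -m pip install -e ./PyLoom -i https://pypi.douban.com/simple
```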
42 |
43 | > If you see the error `fatal error: Python.h: No such file or directory`,
44 | >
45 | > install the python3.x-devel package for your platform.
46 | >
47 |
48 |
49 |
50 | ## Running
51 |
52 | Taking `spiders/WeiBo` as an example:
53 |
54 | 1. **Start the spider with minimal arguments**
55 |
56 | ```bash
57 | pyloom run -s PyLoom/spiders/WeiBo
58 | ```
59 |
60 | > When `run` is executed inside the spider directory, the `-s` option can be omitted.
61 |
62 | 2. **Start the proxy pool**
63 |
64 | ```bash
65 | pyloom proxy run
66 | ```
67 |
68 | 3. **Add a proxy**
69 |
70 | Follow the command prompts to add a proxy named "xxx"
71 |
72 | ```bash
73 | pyloom proxy add
74 | ```
75 |
76 | 4. **Start the spider with the proxy**
77 |
78 | ```bash
79 | pyloom run --proxy xxx
80 | ```
81 |
82 | Commonly used options of the `run` command:
83 |
84 | ```bash
85 | -l, --level   log level
86 | -s, --spider  spider directory
87 | -r, --redis   redis address (URL form)
88 | -C, --clear   clear queue and proxy data before running
89 | --proxy       run with the given proxies, comma-separated
90 | --daemon      run as a daemon
91 | -p            number of child processes
92 | -t            number of threads per child process
93 | ```
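For example, a single invocation combining several of these options (the redis URL and log level value below are illustrative):

```bash
pyloom run -s PyLoom/spiders/WeiBo -r redis://127.0.0.1:6379/0 -p 2 -t 8 -l DEBUG
```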
94 |
95 | When running on multiple servers, performance scales horizontally as long as the `-s` and `-r` options point to the same targets.
96 |
97 | By default, PyLoom prints the scraped data to the log; override the `on_save` method to customize how data is saved.
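For example, a minimal sketch of a custom `on_save` that appends results to a JSON-lines file instead of logging them (the fields follow the DouBan250 demo spider; the output path is illustrative):

```python
import json

from pyloom.tasks import Task


class DouBanTask(Task):
    filters = ["^https://movie.douban.com/top250(\?start=\d+)?$"]

    # ... on_download / on_parse / on_link as in spiders/DouBan250 ...

    def on_save(self):
        # self.result holds the list returned by on_parse
        with open("douban250.jl", "a", encoding="utf-8") as f:
            for movie in self.result:
                f.write(json.dumps(movie, ensure_ascii=False) + "\n")
```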
--------------------------------------------------------------------------------
/pyloom/errors.py:
--------------------------------------------------------------------------------
1 | class TaskError(Exception):
2 | """
3 | An exception raised during a Task's lifecycle
4 | If a Task raises this exception, the current URL is added to the error queue ('error')
5 | """
6 |
7 | def __init__(self, tag):
8 | self.err = f"TaskError('{tag}')"
9 |
10 | def __str__(self):
11 | return self.err
12 |
13 |
14 | class TaskFinish(Exception):
15 | """End the lifecycle early and add the current URL to the Bloom filter"""
16 |
17 |
18 | class TaskBreak(Exception):
19 | """End the lifecycle early and return the current URL to the task queue"""
20 |
21 | def __init__(self, priority=0):
22 | """
23 | Args:
24 | priority: priority of the queue to return the URL to
25 | """
26 | self.priority = priority
27 |
28 |
29 | class RetryExceeded(TaskError):
30 | """Maximum number of retries exceeded"""
31 |
32 | def __init__(self):
33 | self.err = "RetryExceeded"
34 |
35 |
36 | class RequestError(Exception):
37 | """Request error"""
38 |
39 |
40 | class Timeout(RequestError):
41 | """Request timed out"""
42 |
43 |
44 | class ProxyError(Exception):
45 | """Proxy error"""
46 |
47 |
48 | class RetryError(Exception):
49 | """Retry error; must be used together with the tasks.retry decorator"""
50 |
51 |
52 | class JSONDecodeError(ValueError):
53 | """Decoding error raised when using Response().json"""
54 |
55 |
56 | class DebuggerError(Exception):
57 | pass
58 |
59 |
60 | class SchedulerError(Exception):
61 | pass
62 |
63 |
64 | class ConfigError(Exception):
65 | def __init__(self, name, err=None):
66 | self.name = name
67 | self.err = err
68 |
69 | def __str__(self):
70 | s = f"配置'{self.name}'有误"
71 | if self.err is not None:
72 | s += f", {self.err}"
73 | return s
74 |
75 |
76 | class ConfigFileNotFoundError(ConfigError, FileNotFoundError):
77 | def __init__(self, file):
78 | self.file = file
79 |
80 | def __str__(self):
81 | return f"未找到配置文件:'{self.file}'"
82 |
83 |
84 | class ConfigNotNone(ConfigError, ValueError):
85 | def __init__(self, name):
86 | self.name = name
87 |
88 | def __str__(self):
89 | return f"缺少配置项:'{self.name}'"
90 |
91 |
92 | class BucketError(Exception):
93 | pass
94 |
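# --- Illustrative usage only (not part of the original file) ---
# Inside a Task's hooks, these exceptions control what happens to the current URL, e.g.:
#   raise TaskError('banned')    # move the URL to the error queue with tag 'banned' (tag is illustrative)
#   raise TaskFinish             # stop early and mark the URL as finished (added to the Bloom filter)
#   raise TaskBreak(priority=1)  # stop early and return the URL to the queue at priority 1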
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # dotenv
80 | .env
81 |
82 | # virtualenv
83 | venv/
84 | ENV/
85 |
86 | # Spyder project settings
87 | .spyderproject
88 |
89 | # Rope project settings
90 | .ropeproject
91 |
92 | ### JetBrains template
93 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
94 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
95 |
96 | # User-specific stuff:
97 | .idea/
98 | .DS_Store
99 |
100 | # Sensitive or high-churn files:
101 | .idea/dataSources.ids
102 | .idea/dataSources.xml
103 | .idea/dataSources.local.xml
104 | .idea/sqlDataSources.xml
105 | .idea/dynamic.xml
106 | .idea/uiDesigner.xml
107 |
108 | # Gradle:
109 | .idea/gradle.xml
110 | .idea/libraries
111 |
112 | # Mongo Explorer plugin:
113 | .idea/mongoSettings.xml
114 |
115 | ## File-based project format:
116 | *.iws
117 |
118 | ## Plugin-specific files:
119 |
120 | # IntelliJ
121 | /out/
122 |
123 | # mpeltonen/sbt-idea plugin
124 | .idea_modules/
125 |
126 | # JIRA plugin
127 | atlassian-ide-plugin.xml
128 |
129 | # Crashlytics plugin (for Android Studio and IntelliJ)
130 | com_crashlytics_export_strings.xml
131 | crashlytics.properties
132 | crashlytics-build.properties
133 | fabric.properties
134 |
135 | # pytest
136 | .pytest_cache/
137 |
138 | # pyloom
139 | __dev_*.py
140 | __debugger__/
141 | logs/
--------------------------------------------------------------------------------
/spiders/LaGou/README.md:
--------------------------------------------------------------------------------
1 | ## LaGou Spider
2 |
3 | ### Job details (JobDetails)
4 |
5 | | Field              | Example                                      | Description               |
6 | | ------------------ | -------------------------------------------- | ------------------------- |
7 | | result._id         | 5080106                                      | Job id                    |
8 | | result.title       | 演员实习生                                   | Title                     |
9 | | result.label       | ['移动互联网', '广告营销']                   | Labels                    |
10 | | result.job_request | 2k-4k/上海 /经验应届毕业生 /大专及以上 /实习 | Requirements              |
11 | | result.advantage   | 周末双休,地铁周边,做五休二,氛围融洽      | Perks                     |
12 | | result.job_bt      | 职位描述:岗位职责:1参与公司广告...          | Job description           |
13 | | result.work_addr   | 上海-徐汇区- 桂林路396号3号楼                | Work address              |
14 | | result.status      | 0                                            | Status, 0: open 1: closed |
15 | | result.job_company | 乐推(上海)文化传播有限公司                 | Company name              |
16 | | result.type        | 移动互联网,广告营销领域                      | Category                  |
17 | | result.time        | 2018-09-02                                   | Publish date              |
18 |
19 | ```python
20 | self.result = {
21 | '_id': '5080106',
22 | 'title': '演员实习生',
23 | 'label': ['移动互联网', '广告营销'],
24 | 'job_request': '2k-4k/上海 /经验应届毕业生 /大专及以上 /实习',
25 | 'advantage': '周末双休,地铁周边,做五休二,氛围融洽',
26 | 'job_bt': '职位描述:岗位职责:1参与公司广告和短剧的拍摄;2负责公司项目前期筹备等的相关工作;3出演抖音广告与搞笑视频。任职要求:1长相甜美,外形清新亮丽,镜头感强,有强烈的表现力;2专科以上学历,表演系专业优先;3性格活泼开朗、思维活跃、为人正直;4工作态度积极;5仅仅招收女演员。',
27 | 'work_addr': '上海-徐汇区- 桂林路396号3号楼',
28 | 'status': 0,
29 | 'job_company': '乐推(上海)文化传播有限公司',
30 | 'type': '移动互联网,广告营销领域',
31 | 'time': '2018-09-02'
32 | }
33 | ```
34 |
35 |
36 |
37 | ### Company details (GongSiDetails)
38 |
39 | | Field                    | Example                    | Description     |
40 | | ------------------------ | -------------------------- | --------------- |
41 | | result._id               | 324                        | Company id      |
42 | | result.company_abbr      | 爱立示                     | Short name      |
43 | | result.company_full_name | 慈溪爱立示信息科技有限公司 | Full name       |
44 | | result.type              | 信息安全,数据服务          | Category        |
45 | | result.process           | 未融资                     | Funding stage   |
46 | | result.number            | 15-50人                    | Headcount       |
47 | | result.address           | 北京                       | Location        |
48 | | result.label             | ['技能培训', '岗位晋升']   | Company labels  |
49 | | result.website           | http://www.alstru.com      | Company website |
50 |
51 | ```python
52 | self.result = {
53 | '_id': '324',
54 | 'company_abbr': '爱立示',
55 | 'company_full_name': '慈溪爱立示信息科技有限公司',
56 | 'type': '信息安全,数据服务',
57 | 'process': '未融资',
58 | 'number': '15-50人',
59 | 'address': '北京',
60 | 'label': ['技能培训', '岗位晋升', '扁平管理', '领导好', '五险一金', '弹性工作'],
61 | 'website': 'http://www.alstru.com'
62 | }
63 | ```
64 |
65 |
--------------------------------------------------------------------------------
/pyloom/lua/url_pop.lua:
--------------------------------------------------------------------------------
1 | -- Request a task assignment
2 | -- Keys: now
3 | -- Argv: name [name ...]
4 | -- Return: (url, name, address)
5 | local now = tonumber(KEYS[1])
6 | for i = 1, #ARGV do
7 | local name = ARGV[i] -- spider name
8 | local key_spider = "spider:" .. name
9 | local status = tonumber(redis.call("HGET", key_spider, "status"))
10 | redis.call("HSET", key_spider, "last_heartbeat_time", now)
11 | -- Condition: the spider is at least in the ready state
12 | if status >= 10 then
13 | local interval = cjson.decode(redis.call("HGET", key_spider, "interval"))
14 | local last_pop_time = cjson.decode(redis.call("HGET", key_spider, "last_pop_time"))
15 | -- Condition: the spider has reached its next allowed pop time
16 | if now >= last_pop_time + interval then
17 | local proxies = cjson.decode(redis.call("HGET", key_spider, "proxies"))
18 | local address = false
19 | -- If the spider is configured with proxies, keep popping until a usable proxy is found, even if it empties the pool
20 | if #proxies ~= 0 then
21 | local recycle = {}
22 | while not address do
23 | address = redis.call("RPOP", "proxy:addresses:" .. name)
24 | -- The proxy pool is empty; stop popping
25 | if not address then
26 | break
27 | end
28 | local t1 = string.find(address, ":", 1)
29 | local t2 = string.find(address, ":", t1 + 1)
30 | local valid_at = tonumber(string.sub(address, 1, t1 - 1))
31 | local expire_at = tonumber(string.sub(address, t1 + 1, t2 - 1))
32 |
33 | if valid_at > now then
34 | -- The proxy is not yet usable; put it back and pop another one
35 | table.insert(recycle, address)
36 | address = false
37 | else
38 | -- The proxy is usable and not expired -> got a proxy!
39 | if expire_at > now then
40 | break
41 | end
42 | -- The proxy is usable but already expired -> pop another one
43 | -- Expired proxies are not returned to the pool
44 | end
45 | end
46 | if #recycle ~= 0 then
47 | redis.call("LPUSH", "proxy:addresses:" .. name, unpack(recycle))
48 | end
49 | end
50 | -- Condition: the spider has no proxies configured, or a usable proxy was found
51 | if #proxies == 0 or address then
52 | local key_processing = "queue:" .. name .. ":processing"
53 | for priority = 0, 4 do
54 | -- Condition: not all of the spider's queues are empty
55 | local key_waiting = "queue:" .. name .. ":waiting:" .. priority
56 | local url = redis.call("RPOP", key_waiting)
57 | -- All conditions met; hand out the task
58 | if url then
59 | -- Add it to the processing set
60 | redis.call("HSET", key_processing, url, now)
61 | -- Update last_pop_time
62 | redis.call("HSET", key_spider, "last_pop_time", now)
63 | return { url, name, address }
64 | end
65 | end
66 | -- When all queues are empty, mark the spider as finished
67 | local processing_len = redis.call("HLEN", key_processing)
68 | if processing_len == 0 then
69 | redis.call("HSET", key_spider, "status", 0)
70 | end
71 | end
72 | end
73 | end
74 | end
75 |
76 | return { false, false, false }
77 |
--------------------------------------------------------------------------------
/pyloom/lua/url_add.lua:
--------------------------------------------------------------------------------
1 | -- Deduplicate a batch of URLs and add them to the queue
2 | -- Keys: name priority
3 | -- Argv: url [url ...]
4 | -- Return: count, the number of URLs actually added (i.e. not duplicates)
5 |
6 | local bloom_check = function(name, entries, precision, value)
7 | local prefix = "queue:" .. name .. ":filter:bloom"
8 | local count = redis.call('GET', prefix .. ':count')
9 | if not count then
10 | return 0
11 | end
12 |
13 | local factor = math.ceil((entries + count) / entries)
14 | -- 0.69314718055995 = ln(2)
15 | local index = math.ceil(math.log(factor) / 0.69314718055995)
16 | local scale = math.pow(2, index - 1) * entries
17 | local hash = redis.sha1hex(value)
18 |
19 | -- This uses a variation on:
20 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter'
21 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
22 | local h = {}
23 | h[0] = tonumber(string.sub(hash, 1, 8), 16)
24 | h[1] = tonumber(string.sub(hash, 9, 16), 16)
25 | h[2] = tonumber(string.sub(hash, 17, 24), 16)
26 | h[3] = tonumber(string.sub(hash, 25, 32), 16)
27 |
28 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
29 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127
30 | -- 0.4804530139182 = ln(2)^2
31 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182)
32 | -- 0.69314718055995 = ln(2)
33 | local maxk = math.floor(0.69314718055995 * maxbits / scale)
34 | local b = {}
35 |
36 | for i = 1, maxk do
37 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)])
38 | end
39 |
40 | for n = 1, index do
41 | local key = prefix .. ':' .. n
42 | local found = true
43 | local scalen = math.pow(2, n - 1) * entries
44 |
45 | -- 0.4804530139182 = ln(2)^2
46 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182)
47 |
48 | -- 0.69314718055995 = ln(2)
49 | local k = math.floor(0.69314718055995 * bits / scalen)
50 |
51 | for i = 1, k do
52 | if redis.call('GETBIT', key, b[i] % bits) == 0 then
53 | found = false
54 | break
55 | end
56 | end
57 |
58 | if found then
59 | return 1
60 | end
61 | end
62 |
63 | return 0
64 | end
65 |
66 | local name = KEYS[1]
67 | local priority = KEYS[2]
68 | local key_spider = "spider:" .. name
69 | local key_waiting = "queue:" .. name .. ":waiting:" .. priority
70 | local key_filter_queue = "queue:" .. name .. ":filter:queue"
71 | local filter = {} -- deduplicate within ARGV
72 | local urls = {} -- URLs to be pushed to the waiting queue in one batch
73 |
74 | -- Read the Bloom filter parameters from the spider configuration
75 | local precision = redis.call('HGET', key_spider, 'precision')
76 | if not precision then
77 | return { err = "爬虫未配置'precision'" }
78 | end
79 |
80 | -- Deduplicate
81 | for i = 1, #ARGV do
82 | local url = ARGV[i]
83 | local exists = filter[url] or
84 | bloom_check(name, 1000000, precision, url) == 1 or
85 | redis.call('SISMEMBER', key_filter_queue, url) == 1
86 | if not exists then
87 | filter[url] = true
88 | table.insert(urls, url)
89 | end
90 | end
91 |
92 | -- Push to the queue
93 | if #urls == 0 then
94 | return 0
95 | else
96 | redis.call('LPUSH', key_waiting, unpack(urls))
97 | redis.call('SADD', key_filter_queue, unpack(urls))
98 | return #urls
99 | end
100 |
--------------------------------------------------------------------------------
/pyloom/lua/bloom_check.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | The MIT License (MIT)
3 |
4 | Copyright (c) 2017 Erik Dubbelboer
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | ]]
24 |
25 | -- Slightly modified from the original work by Erik Dubbelboer
26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter
27 |
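-- A note on the sizing math used below (the standard scaling Bloom filter formulas):
-- for a sub-filter expected to hold n items with false-positive rate p,
--   bits = -n * ln(p) / ln(2)^2    (size of the bit array)
--   k    = ln(2) * bits / n        (number of hash functions)
-- Each successive sub-filter doubles its capacity (scale = 2^(index-1) * entries) and
-- tightens its target false-positive rate (precision * 0.5^index), which is what the
-- expressions using 0.69314718055995 (ln 2) and 0.4804530139182 (ln(2)^2) compute.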
28 | local bloom_check = function(name, entries, precision, value)
29 | local prefix = "queue:" .. name .. ":filter:bloom"
30 | local count = redis.call('GET', prefix .. ':count')
31 | if not count then
32 | return 0
33 | end
34 |
35 | local factor = math.ceil((entries + count) / entries)
36 | -- 0.69314718055995 = ln(2)
37 | local index = math.ceil(math.log(factor) / 0.69314718055995)
38 | local scale = math.pow(2, index - 1) * entries
39 | local hash = redis.sha1hex(value)
40 |
41 | -- This uses a variation on:
42 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter'
43 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
44 | local h = {}
45 | h[0] = tonumber(string.sub(hash, 1, 8), 16)
46 | h[1] = tonumber(string.sub(hash, 9, 16), 16)
47 | h[2] = tonumber(string.sub(hash, 17, 24), 16)
48 | h[3] = tonumber(string.sub(hash, 25, 32), 16)
49 |
50 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
51 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127
52 | -- 0.4804530139182 = ln(2)^2
53 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182)
54 | -- 0.69314718055995 = ln(2)
55 | local maxk = math.floor(0.69314718055995 * maxbits / scale)
56 | local b = {}
57 |
58 | for i = 1, maxk do
59 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)])
60 | end
61 |
62 | for n = 1, index do
63 | local key = prefix .. ':' .. n
64 | local found = true
65 | local scalen = math.pow(2, n - 1) * entries
66 |
67 | -- 0.4804530139182 = ln(2)^2
68 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182)
69 |
70 | -- 0.69314718055995 = ln(2)
71 | local k = math.floor(0.69314718055995 * bits / scalen)
72 |
73 | for i = 1, k do
74 | if redis.call('GETBIT', key, b[i] % bits) == 0 then
75 | found = false
76 | break
77 | end
78 | end
79 |
80 | if found then
81 | return 1
82 | end
83 | end
84 |
85 | return 0
86 | end
87 |
88 | -- Read the Bloom filter parameters from the spider configuration
89 | local name = KEYS[1]
90 | local key_spider = "spider:" .. name
91 | local precision = redis.call('HGET', key_spider, 'precision')
92 | if not precision then
93 | return { err = "爬虫未配置'precision'" }
94 | end
95 |
96 | return bloom_check(name, 1000000, precision, ARGV[1])
97 |
--------------------------------------------------------------------------------
/spiders/DouBanBooks/tasks.py:
--------------------------------------------------------------------------------
1 | from pyloom.errors import *
2 | from pyloom.tasks import Task, CSS, retry
3 |
4 |
5 | class BaseTask(Task):
6 | @retry(10, 0)
7 | def on_download(self):
8 | """下载页面"""
9 | try:
10 | response = self.client.get(
11 | url=self.url,
12 | allow_redirects=False,
13 | headers={
14 | "Host": "book.douban.com",
15 | "User-Agent": self.ua.chrome
16 | },
17 | timeout=8
18 | )
19 | except (ProxyError, RequestError):
20 | self.client.reload_proxy()
21 | raise RetryError
22 | # Check whether the IP has been banned
23 | if response.status_code == 200:
24 | s = 'window.location.href="https://sec.douban.com/a'
25 | if s in response.text:
26 | self.logger.warning("IP被封禁:200", self.client.address)
27 | self.client.reuse_proxy(300)
28 | else:
29 | self.client.reuse_proxy()
30 | return response
31 | elif response.status_code == 302:
32 | self.logger.warning("IP被封禁:302", self.client.address)
33 | self.client.reuse_proxy(300)
34 | else:
35 | self.logger.warning("请求错误", response.status_code)
36 | self.client.reload_proxy()
37 | raise RetryError
38 |
39 | def parse_tag_urls(self):
40 | """Extract all tag links from the page"""
41 | # Relative paths of all tag detail pages
42 | paths = self.response.re.many("/tag/\w+")
43 | # Build links to the first 50 pages of each tag's detail listing; these are enqueued at the lowest priority
44 | return [
45 | f"https://book.douban.com{path}?start={i*20}&type=R"
46 | for path in paths for i in range(50)
47 | ]
48 |
49 |
50 | class BookDetailsTask(BaseTask):
51 | """图书详情页"""
52 | filters = ["https://book.douban.com/subject/(\d+)/"]
53 |
54 | def on_parse(self):
55 | css = self.response.css
56 | # Basic book information
57 | info = {}
58 | for line in css.one("div#info").html().split("\n"):
59 | items = [
60 | ' '.join(s.split())
61 | for s in CSS(line).text(separator=" ").split(":", 1)
62 | if s.strip()
63 | ]
64 | if len(items) == 2:
65 | info[items[0]] = items[1]
66 | result = {
67 | "title": css.one("h1 > span").text(),
68 | "cover": css.one("div#mainpic img").attrs.get("src", None),
69 | "info": info,
70 | "rating_num": css.one("div.rating_self > strong.rating_num").text() or None,
71 | "rating_people": css.one("a.rating_people > span").default(None).text(),
72 | "intro": css.one("div#link-report div.intro > p").default(None).text(separator="\n"),
73 | "tags": [n.text() for n in css.many("div#db-tags-section a")],
74 | }
75 | return result
76 |
77 | def on_link(self):
78 | books = self.response.re.many("https://book.douban.com/subject/\d+/")
79 | # Assign priorities
80 | return {
81 | 0: books,
82 | 4: self.parse_tag_urls()
83 | }
84 |
85 | def on_save(self):
86 | self.logger.info("抓到新书", self.result)
87 |
88 |
89 | class TagsTask(BaseTask):
90 | """热门标签页"""
91 | filters = ["https://book.douban.com/tag/\?view=cloud"]
92 |
93 | def on_link(self):
94 | return self.parse_tag_urls()
95 |
96 |
97 | class TagDetailsTask(BaseTask):
98 | """标签详情页"""
99 | filters = ["https://book.douban.com/tag/(\w+)\?start=(\d+)&type=R"]
100 |
101 | def on_link(self):
102 | books = self.response.re.many("https://book.douban.com/subject/\d+/")
103 | return {
104 | 0: books,
105 | 4: self.parse_tag_urls()
106 | }
107 |
--------------------------------------------------------------------------------
/pyloom/lua/bloom_cas.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | The MIT License (MIT)
3 |
4 | Copyright (c) 2017 Erik Dubbelboer
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | ]]
24 |
25 | -- Slightly modified from the original work by Erik Dubbelboer
26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter
27 |
28 | local bloom_cas = function(name, entries, precision, value)
29 | local hash = redis.sha1hex(value)
30 | local prefix = "queue:" .. name .. ":filter:bloom"
31 | local countkey = prefix .. ':count'
32 | local count = redis.call('GET', countkey)
33 | if not count then
34 | count = 1
35 | else
36 | count = count + 1
37 | end
38 |
39 | local factor = math.ceil((entries + count) / entries)
40 | -- 0.69314718055995 = ln(2)
41 | local index = math.ceil(math.log(factor) / 0.69314718055995)
42 | local scale = math.pow(2, index - 1) * entries
43 |
44 | -- This uses a variation on:
45 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter'
46 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
47 | local h = {}
48 | h[0] = tonumber(string.sub(hash, 1, 8), 16)
49 | h[1] = tonumber(string.sub(hash, 9, 16), 16)
50 | h[2] = tonumber(string.sub(hash, 17, 24), 16)
51 | h[3] = tonumber(string.sub(hash, 25, 32), 16)
52 |
53 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
54 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127
55 | -- 0.4804530139182 = ln(2)^2
56 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182)
57 |
58 | -- 0.69314718055995 = ln(2)
59 | local maxk = math.floor(0.69314718055995 * maxbits / scale)
60 | local b = {}
61 |
62 | for i = 1, maxk do
63 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)])
64 | end
65 |
66 | -- Only do this if we have data already.
67 | if index > 1 then
68 | -- The last filter will be handled below.
69 | for n = 1, index - 1 do
70 | local key = prefix .. ':' .. n
71 | local scale = math.pow(2, n - 1) * entries
72 |
73 | -- 0.4804530139182 = ln(2)^2
74 | local bits = math.floor((scale * math.log(precision * math.pow(0.5, n))) / -0.4804530139182)
75 |
76 | -- 0.69314718055995 = ln(2)
77 | local k = math.floor(0.69314718055995 * bits / scale)
78 |
79 | local found = true
80 | for i = 1, k do
81 | if redis.call('GETBIT', key, b[i] % bits) == 0 then
82 | found = false
83 | break
84 | end
85 | end
86 |
87 | if found then
88 | return 1
89 | end
90 | end
91 | end
92 |
93 | -- For the last filter we do a SETBIT where we check the result value.
94 | local key = prefix .. ':' .. index
95 |
96 | local found = 1
97 | for i = 1, maxk do
98 | if redis.call('SETBIT', key, b[i] % maxbits, 1) == 0 then
99 | found = 0
100 | end
101 | end
102 |
103 | if found == 0 then
104 | -- INCR is a little bit faster than SET.
105 | redis.call('INCR', countkey)
106 | end
107 |
108 | return found
109 | end
110 |
111 | -- Read the Bloom filter parameters from the spider configuration
112 | local name = KEYS[1]
113 | local key_spider = "spider:" .. name
114 | local precision = redis.call('HGET', key_spider, 'precision')
115 | if not precision then
116 | return { err = "爬虫未配置'precision'" }
117 | end
118 |
119 | return bloom_cas(name, 1000000, precision, ARGV[1])
120 |
--------------------------------------------------------------------------------
/pyloom/drivers.py:
--------------------------------------------------------------------------------
1 | import furl
2 | import time
3 | import logging
4 | import requests
5 | import traceback
6 | from . import utils
7 |
8 | logger = logging.getLogger("drivers")
9 |
10 |
11 | class ProxyDriver(object):
12 | """代理驱动的基类,必须继承此类,否则驱动不能被识别"""
13 |
14 | def __init__(self, **kwargs):
15 | """在代理启动时传入自定义参数"""
16 | self.url = kwargs['url']
17 | self.interval = kwargs['interval']
18 | self.parallel = kwargs['parallel']
19 |
20 | @classmethod
21 | def get_params(cls):
22 | """获取自定义参数"""
23 | template = [
24 | {
25 | 'name': 'url',
26 | 'title': '代理提取接口?',
27 | 'example': 'http://api.example.com',
28 | 'regex': 'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
29 | },
30 | {
31 | 'name': 'interval',
32 | 'title': '每隔多少秒调用一次接口?',
33 | 'type': int,
34 | 'note': "0-无间隔"
35 | },
36 | {
37 | 'name': 'parallel',
38 | 'title': '每个代理能被多少线程并发使用?',
39 | 'type': int
40 | }
41 | ]
42 | return utils.template_input(template)
43 |
44 | def gen_addresses(self):
45 | """
46 | 返回一个生成器,每次迭代时返回一个代理,其格式为:
47 | valid_at:expire_at:address
48 | valid_at : 当前时间大于valid_at时代理可用
49 | expire_at: 当前时间小于expire_at时代理可用,大于expire_at时被删除
50 | address : 代理地址,支持http、https、socks5协议
51 | """
52 | raise NotImplementedError
53 |
54 |
55 | class MoGuProxy(ProxyDriver):
56 | title = '蘑菇API代理'
57 |
58 | def gen_addresses(self):
59 | logger.info("代理已启动", self.title, self.url)
60 | while True:
61 | try:
62 | time.sleep(self.interval / 2) # half the interval; if the API fails, only this half is slept
63 | try:
64 | resp = requests.get(self.url, timeout=1)
65 | except Exception as e:
66 | yield False, f"接口请求异常:{e}"
67 | continue
68 |
69 | if resp.status_code != 200:
70 | yield False, f"接口状态码异常:{resp.status_code}"
71 | continue
72 |
73 | try:
74 | data = resp.json()
75 | except Exception:
76 | yield False, f"接口返回值非JSON格式"
77 | continue
78 |
79 | if int(data.get('code', -1)) != 0:
80 | yield False, f'接口返回异常:{data.get("msg", "unknown")}'
81 | continue
82 |
83 | expire_at = time.time() + 600
84 | addresses = [f"0:{expire_at}:http://{i['ip']}:{i['port']}" for i in data.get('msg', [])]
85 | yield True, addresses * self.parallel
86 | time.sleep(self.interval / 2)
87 | except Exception as e:
88 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc())
89 |
90 |
91 | class MiPuProxy(ProxyDriver):
92 | title = "米扑开放代理"
93 |
94 | def gen_addresses(self):
95 | logger.info("代理已启动", self.title, self.url)
96 | while True:
97 | try:
98 | url = furl.furl(self.url)
99 | url.query.params.set('result_format', 'json')
100 | time.sleep(self.interval / 2) # half the interval; if the API fails, only this half is slept
101 | try:
102 | resp = requests.get(url, timeout=1)
103 | except Exception as e:
104 | yield False, f"接口请求异常:{e}"
105 | continue
106 |
107 | if resp.status_code != 200:
108 | yield False, f"接口状态码异常:{resp.status_code}"
109 | continue
110 |
111 | try:
112 | data = resp.json()
113 | except Exception:
114 | yield False, f"接口返回值非JSON格式"
115 | continue
116 |
117 | if int(data.get('code', -1)) != 0:
118 | yield False, f'接口返回异常:{data.get("msg", "unknown")}'
119 | continue
120 |
121 | expire_at = time.time() + 60 * 60 * 24 * 30 * 12 # valid for one year
122 | addresses = []
123 | for item in data.get('result', []):
124 | scheme = item['http_type'].lower()
125 | server = item['ip:port']
126 | addresses.append(f"0:{expire_at}:{scheme}://{server}")
127 |
128 | yield True, addresses * self.parallel
129 | time.sleep(self.interval / 2)
130 | except Exception as e:
131 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc())
132 |
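# --- Illustrative only (not part of the original file) ---
# A minimal sketch of a custom driver that serves a fixed list of proxies, following the
# same (is_ok, addresses) yield protocol used by MoGuProxy/MiPuProxy above. The class name
# and the 'addresses' parameter are hypothetical; a real driver would also override
# get_params to prompt for them.
class StaticListProxy(ProxyDriver):
    title = "Static proxy list (example)"

    def __init__(self, **kwargs):
        # e.g. addresses="http://1.2.3.4:8080,socks5://5.6.7.8:1080"
        self.addresses = kwargs['addresses'].split(',')
        self.interval = kwargs['interval']
        self.parallel = kwargs['parallel']

    def gen_addresses(self):
        while True:
            # Re-issue the same proxies each cycle: usable immediately (valid_at=0),
            # expiring shortly after the next cycle so stale entries get dropped.
            expire_at = time.time() + self.interval + 60
            yield True, [f"0:{expire_at}:{a}" for a in self.addresses] * self.parallel
            time.sleep(self.interval)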
--------------------------------------------------------------------------------
/spiders/Ziroom/README.md:
--------------------------------------------------------------------------------
1 | ## Ziroom Spider
2 |
3 | ### Installation
4 |
5 | 1. [Install tesseract-ocr](https://github.com/tesseract-ocr/tesseract/wiki)
6 |
7 | 2. Install the pytesseract library
8 |
9 | ```
10 | pip install pytesseract
11 | ```
12 |
13 |
14 | ### Listings (NLTask)
15 |
16 | Listing information from the rental list pages
17 |
18 | | Field            | Example                                               | Description      |
19 | | ---------------- | ----------------------------------------------------- | ---------------- |
20 | | result.price     | 1160                                                  | Price            |
21 | | result.href      | www.ziroom.com/z/vr/61441027.html                     | Detail page link |
22 | | result.img_src   | static8.ziroom.com/phoenix/pc/images/list/loading.jpg | Image            |
23 | | result.block     | 天恒乐活城                                            | Neighborhood     |
24 | | result.name      | 整租 · 天恒乐活城2居室-南                             | Listing name     |
25 | | result.site      | [通州通州其它] 亦庄线次渠南                           | Location         |
26 | | result.detail    | 14.1 ㎡\|6/6层\|3室1厅距15号线石门站690米             | Details          |
27 | | result.room_tags | ['离地铁近', '独立阳台', '集体供暖', '友家3.0 木棉']  | Tags             |
28 |
29 | ```python
30 | self.result = [
31 | {
32 | 'price': '1830',
33 | 'href': 'www.ziroom.com/z/vr/61514855.html',
34 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg',
35 | 'block': '世茂维拉', 'name': '友家 · 世茂维拉5居室-南卧',
36 | 'site': '[房山长阳] 房山线广阳城',
37 | 'detail': '12.3 ㎡|5/5层|5室1厅距房山线广阳城站696米有2间空房',
38 | 'room_tags': ['离地铁近', '独卫', '集体供暖', '友家4.0 拿铁']
39 | },
40 | {
41 | 'price': '1830',
42 | 'href': 'www.ziroom.com/z/vr/261810.html',
43 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg',
44 | 'block': '前进花园石门苑',
45 | 'name': '友家 · 前进花园石门苑3居室-南卧',
46 | 'site': '[顺义顺义城] 15号线石门',
47 | 'detail': '14.1 ㎡|6/6层|3室1厅距15号线石门站690米',
48 | 'room_tags': ['离地铁近', '独立阳台', '集体供暖', '友家4.0 布丁']
49 | },
50 | ...
51 | ]
52 | ```
53 |
54 |
55 |
56 | ### Listing details (VRTask)
57 |
58 | | Field                    | Example                                                   | Description                                 |
59 | | ------------------------ | --------------------------------------------------------- | ------------------------------------------- |
60 | | result.img               | ['http://pic.ziroom.com/house_images.jpg',...]             | Photos                                      |
61 | | result.room_name         | 和平家园小区4居室-02卧                                     | Name                                        |
62 | | result.ellipsis          | [昌平 沙河] 昌平线 昌平                                    | Location                                    |
63 | | result.room_id           | 61264525                                                   | Room id                                     |
64 | | result.house_id          | 60203175                                                   | House id                                    |
65 | | result.current_city_code | 110000                                                     | City code, see [appendix](#current_city_code) |
66 | | result.detail_room       | {'面积': '15.4㎡', '朝向': '南', '户型': '4室1厅合',...}   | Room parameters                             |
67 | | result.number            | BJZRGY0818215849_02                                        | Listing number                              |
68 | | result.periphery         | 学校:中国政法大学法学院、中国石油大学...                   | Surroundings                                |
69 | | result.traffic           | 公交:314路、昌平9路...                                     | Transportation                              |
70 | | result.configuration     | ['bed', 'desk', 'chest', 'calorifier']                     | Furnishings                                 |
71 | | result.roommate          | [{'性别': 'man', '房间号': '01卧', '星座': '天蝎座',...}]  | Roommate info                               |
72 | | result.price             | 1890                                                       | Price                                       |
73 |
74 | ```python
75 | self.result = {
76 | 'img':[
77 | 'http://pic.ziroom.com/house_images/g2m1/M00/5B/DF/v180x135.jpg',
78 | 'http://pic.ziroom.com/house_images/g2m1/M00/5D/0B/v180x135.jpg'
79 | ],
80 | 'room_name': '和平家园小区4居室-02卧',
81 | 'ellipsis': '[昌平 沙河] 昌平线 昌平',
82 | 'room_id': '61264525',
83 | 'house_id': '60203175',
84 | 'current_city_code': '110000',
85 | 'detail_room': {
86 | '面积': '15.4㎡',
87 | '朝向': '南',
88 | '户型': '4室1厅合',
89 | '楼层': '6/6层',
90 | '交通': '距15号线石门307米距15号线顺义1621米距15号线南法信2290米'
91 | },
92 | 'number': 'BJZRGY0818215849_02',
93 | 'periphery': '学校:中国政法大学法学院 医院:北京化工大学校医院',
94 | 'traffic': '公交:314路、昌平9路、914路、昌平3路、昌平5路、326路、345路',
95 | 'configuration': ['bed', 'desk', 'chest', 'calorifier'],
96 | 'roommate':[
97 | {
98 | '性别': 'man',
99 | '房间号': '01卧',
100 | '星座': '天蝎座',
101 | '职业': '产品',
102 | '入住时间': '2018/07-2019/07'
103 | },
104 | {
105 | '性别': 'current',
106 | '房间号': '02卧',
107 | '星座': '…',
108 | '职业': '…',
109 | '入住时间': '…'
110 | }
111 | ],
112 | 'price': 1890
113 | }
114 | ```
115 |
116 | + When a room has no roommate, its entry in result.roommate has gender 'current', and '…' means the information is empty
117 |
118 | ### Appendix
119 |
120 | #### current_city_code
121 |
122 | | City code | City name |
123 | | --------- | --------- |
124 | | 110000 | 北京 |
125 | | 310000 | 上海 |
126 | | 440300 | 深圳 |
127 | | 330100 | 杭州 |
128 | | 320100 | 南京 |
129 | | 440100 | 广州 |
130 | | 510100 | 成都 |
131 | | 420100 | 武汉 |
132 | | 120000 | 天津 |
133 |
134 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuo/README.md:
--------------------------------------------------------------------------------
1 | ## PinDuoDuo Spider
2 |
3 | #### Category goods list (OperationTask)
4 |
5 | Goods list for the categories in the search tab
6 |
7 | | Field               | Example                                                      | Description     |
8 | | ------------------- | ------------------------------------------------------------ | --------------- |
9 | | result.goods_id     | 2721076214                                                    | Goods id        |
10 | | result.goods_name   | 【两件装】秋季男装长袖t恤...                                  | Goods name      |
11 | | result.thumb_url    | http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg | Goods image URL |
12 | | result.cnt          | 545                                                           | Units sold      |
13 | | result.normal_price | 2500                                                          | Selling price   |
14 | | result.market_price | 9900                                                          | List price      |
15 | | result.price        | 1280                                                          | Group-buy price |
16 | | result.updated_at   | 2018-09-02 13:58:08.176553                                    | Crawl time      |
17 |
18 | ```python
19 | self.result = [
20 | {
21 | 'goods_id': 2721076214,
22 | 'goods_name': '【两件装】秋季男装长袖t恤青年韩版潮流上衣学生修身百搭打底衫',
23 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg',
24 | 'cnt': 545,
25 | 'normal_price': 2500,
26 | 'market_price': 9900,
27 | 'price': 1280,
28 | 'updated_at': '2018-09-02 13:58:08.176553'
29 | },
30 | {
31 | 'goods_id': 142150779,
32 | 'goods_name': '【花花公子贵宾】春夏秋款宽松直筒牛仔裤男弹力休闲商务大码长裤',
33 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-08-14/a18025b1e91445e5ac2acb26be773cd1.jpeg',
34 | 'cnt': 294167,
35 | 'normal_price': 4990,
36 | 'market_price': 29800,
37 | 'price': 2990,
38 | 'updated_at': '2018-09-02 13:58:08.176553'
39 | }
40 | ...
41 | ]
42 | ```
43 |
44 |
45 |
46 | #### Goods details (GoodsTask)
47 |
48 | | Field               | Example                              | Description                          |
49 | | ------------------- | ------------------------------------ | ------------------------------------ |
50 | | result.goods_sn     | "1805231480604761"                   | SN code                              |
51 | | result.goods_id     | 1480604761                           | Goods id                             |
52 | | result.cat_id       | 9813                                 | Search id                            |
53 | | result.goods_name   | 【凡爱宝贝】3d立体墙贴自粘...        | Name                                 |
54 | | result.goods_desc   | 【3d立体墙贴】【环保无味】...        | Description                          |
55 | | result.market_price | 3500                                 | List price                           |
56 | | result.is_onsale    | 1                                    | On sale, 0: off the shelf 1: on sale |
57 | | result.thumb_url    | http://t00img.yangkeduo.com/...      | Thumbnail                            |
58 | | result.hd_thumb_url | http://t00img.yangkeduo.com/...      | HD thumbnail                         |
59 | | result.image_url    | http://t00img.yangkeduo.com/...      | Goods image URL                      |
60 | | result.price        | {"min_on_sale_group_price": 358,...} | Prices, see [details](#price)        |
61 | | result.gallery      | [{"id": 34954263707,'url':...}]      | Goods detail images                  |
62 | | result.created_at   | 1527069514                           | Creation timestamp                   |
63 | | result.sales        | 167701                               | Sales volume                         |
64 | | result.cat_id_list  | [9316, 9402, 9813]                   | Multi-level categories               |
65 | | result.sku          | [{"sku_id": 33940681934,...}]        | SKU details, see [details](#sku)     |
66 |
67 | ```python
68 | self.result = {
69 | "goods_sn": "1805231480604761",
70 | "goods_id": 1480604761,
71 | "cat_id": 9813,
72 | "goods_name": "【凡爱宝贝】3d立体墙贴自粘防水墙纸防撞壁纸客厅卧室砖纹贴纸",
73 | "goods_desc": "【3d立体墙贴】【环保无味】【无甲醛 免胶自粘】绿色环保、无毒、无味,免人工,带胶撕开底纸即可粘贴,产品粘性强,不易脱落,具有很好的防撞、防水、防潮效果,易遮盖污点,环保无异味,施工简单,规格:70cm宽X77cm高; 工厂直销,砖纹形,装饰儿童房、卧室、客厅背景墙、走廊,也可发挥想象自由裁剪DIY。【计算方式】长x宽=面积,总面积÷单片面积=片数,一片尺寸是70cm宽X77cm高=0.539平方【友情提示】为避免不够,建议需要尽量多买2片备着,因为不同批次颜色有可能存在差异,所以请亲们一次购买足够。",
74 | "market_price": 3500,
75 | "is_onsale": 1,
76 | "thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/f9ba2f52be83d2f55142c55f44ec678c.jpeg",
77 | "hd_thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/8c5790dfea2422328ee3a487f3685ed6.jpeg",
78 | "image_url": "http://t00img.yangkeduo.com/goods/images/2018-07-22/95065d45399fce770bb49de0fba5c590.jpeg",
79 | "goods_type": 1,
80 | "gallery": [
81 | {
82 | "id": 34954263707,
83 | "url": "http://t00img.yangkeduo.com/t10img/images/2018-07-16/d864ed35818e90521cf858951d9dc349.jpeg"
84 | }
85 | ],
86 | "created_at": 1527069514,
87 | "sales": 167701,
88 | "price": {
89 | "min_on_sale_group_price": 358,
90 | "max_on_sale_group_price": 781,
91 | "min_on_sale_normal_price": 490,
92 | "max_on_sale_normal_price": 1500,
93 | "min_group_price": 358,
94 | "max_group_price": 781,
95 | "max_normal_price": 1500,
96 | "min_normal_price": 490,
97 | "old_min_on_sale_group_price": 390,
98 | "old_max_on_sale_group_price": 860,
99 | "old_min_group_price": 390,
100 | "old_max_group_price": 860
101 | },
102 | "cat_id_list": [9316, 9402, 9813],
103 | "sku": [
104 | {
105 | "sku_id": 33940681934,
106 | "goods_id": 1480604761,
107 | "thumb_url": "http://t00img.yangkeduo.com/t07img/images/2018-07-12/94b7c9302b62c64e22914e6e36fb9d40.png",
108 | "quantity": 0,
109 | "normal_price": 1500,
110 | "group_price": 561,
111 | "old_group_price": 610,
112 | "specs": [
113 | {
114 | "spec_key": "尺寸",
115 | "spec_value": "尺寸70*77厘米/1张"
116 | },
117 | {
118 | "spec_key": "颜色",
119 | "spec_value": "特价白色(70*77厘米)"
120 | }
121 | ]
122 | }
123 | ]
124 | }
125 | ```
126 |
127 | + cat_id_list contains the goods' multi-level category ids, in order: first-level, second-level, and third-level category
128 |
129 | ### Appendix
130 |
131 | #### price
132 |
133 | | Value                       | Meaning                                            |
134 | | --------------------------- | -------------------------------------------------- |
135 | | min_on_sale_group_price     | Lowest group-buy price of on-sale goods            |
136 | | max_on_sale_group_price     | Highest group-buy price of on-sale goods           |
137 | | min_on_sale_normal_price    | Lowest price of on-sale goods                      |
138 | | max_on_sale_normal_price    | Highest price of on-sale goods                     |
139 | | min_group_price             | Lowest group-buy price                             |
140 | | max_group_price             | Highest group-buy price                            |
141 | | max_normal_price            | Highest price                                      |
142 | | min_normal_price            | Lowest price                                       |
143 | | old_min_on_sale_group_price | Previous lowest group-buy price of on-sale goods   |
144 | | old_max_on_sale_group_price | Previous highest group-buy price of on-sale goods  |
145 | | old_min_group_price         | Previous lowest group-buy price                    |
146 | | old_max_group_price         | Previous highest group-buy price                   |
147 |
148 | #### sku
149 |
150 | | Value            | Meaning                  |
151 | | ---------------- | ------------------------ |
152 | | sku_id           | SKU id                   |
153 | | goods_id         | Goods id                 |
154 | | thumb_url        | SKU image URL            |
155 | | quantity         | Quantity                 |
156 | | normal_price     | List price               |
157 | | group_price      | Group-buy price          |
158 | | old_group_price  | Previous group-buy price |
159 | | specs.spec_key   | Spec key                 |
160 | | specs.spec_value | Spec value               |
161 |
162 |
--------------------------------------------------------------------------------
/pyloom/proxy.py:
--------------------------------------------------------------------------------
1 | import json
2 | import redis
3 | import traceback
4 | from .utils import *
5 | from .errors import *
6 | from . import drivers
7 | from .scheduler import Spider
8 | from threading import Thread, Lock
9 |
10 | logger = logging.getLogger("proxy")
11 |
12 |
13 | def proxy_handler(redis_conf, names, name, router, router_lock, driver, **params):
14 | """
15 | 从驱动获取代理,推送至指定爬虫的代理池中
16 | 每只爬虫有一个代理池,键名为'proxy:proxies:',list型
17 | [address1, address2, address3...]
18 | 其中address为str型,结构为:'valid_at:expire_at:address'
19 | valid_at: 代理生效时间
20 | expire_at: 代理失效时间
21 | address: 代理地址
22 | address在valid_at < now < expire_at可用,并在now > expire_at时被删除
23 | Args:
24 | redis_conf: Redis配置
25 | names: 所有存活代理
26 | name: 当前代理名,通过name in names判断当前代理是否存活
27 | router: 路由表,获取代理后,根据路由推送至代理池
28 | router_lock: router的锁
29 | driver: 驱动类
30 | params: 驱动参数
31 | """
32 | db = redis.StrictRedis.from_url(redis_conf)
33 | gen = driver(**params).gen_addresses()
34 | for is_ok, result in gen:
35 | with router_lock:
36 | targets = router.get(name, [])
37 | logger.debug("代理正在运行", name, targets)
38 | if not targets:
39 | logger.info("代理退出,router中没有记录", name, driver)
40 | break
41 | if name not in names:
42 | logger.info("代理退出,names中没有记录", name, driver)
43 | break
44 | if is_ok:
45 | for target in targets:
46 | length = db.lpush(f"proxy:addresses:{target}", *result)
47 | logger.info(f"添加代理, 代理:{name},目标:{target}, 新增数量:{len(result)}, 当前数量:{length}\n", result)
48 | else:
49 | time.sleep(1)
50 | else:
51 | logger.warning("代理出现异常", name, result)
52 |
53 |
54 | def get_driver(driver_name):
55 | """获取并检查驱动是否正确"""
56 | if not hasattr(drivers, driver_name):
57 | raise ProxyError("未找到驱动", driver_name)
58 | driver = getattr(drivers, driver_name)
59 | if not issubclass(driver, drivers.ProxyDriver):
60 | raise ProxyError("驱动应继承自ProxyDriver", driver_name)
61 | if not hasattr(driver, 'title'):
62 | raise ProxyError("驱动缺少属性", f"{driver_name}.title")
63 | return driver
64 |
65 |
66 | def start(redis_conf):
67 | """
68 | 根据代理配置,维护代理线程池
69 | 代理配置为一个redis dict键,键名为proxy:configs,结构为:
70 | {
71 | proxy_name: {
72 | version: str, // 版本号,版本号变化时,代理线程将会重启
73 | driver: str, // 驱动名,对应proxy.py中的类
74 | **params // 驱动参数,将被传递给驱动
75 | }
76 | }
77 | """
78 | logger.info("代理池已启动")
79 | db = redis.StrictRedis.from_url(redis_conf)
80 | threads = {} # thread table, {proxy_name: {'version': int, 'thread': Thread}}
81 | router = {} # routing table, {proxy_name: set([spider_name])}
82 | router_lock = Lock()
83 | for i in itertools.count():
84 | try:
85 | time.sleep(3 if i else 0)
86 | # Refresh the routing table, which tells each proxy thread which spiders to push its proxies to
87 | _router = {}
88 | for spider_name in Spider.names(db):
89 | spider = Spider(db, spider_name)
90 | if spider.get_field("status") < 10:
91 | logger.debug("忽略未就绪爬虫", spider_name)
92 | continue
93 | last_heartbeat_time = spider.get_field("last_heartbeat_time")
94 | if time.time() - last_heartbeat_time > 300:
95 | logger.debug("忽略长久未运行的爬虫", spider_name)
96 | continue
97 | proxies = Spider(db, spider_name).get_field("proxies")
98 | if not proxies:
99 | logger.debug("忽略未配置代理的爬虫", spider_name)
100 | continue
101 | for proxy_name in proxies:
102 | _router.setdefault(proxy_name, set()).add(spider_name)
103 | with router_lock:
104 | router.clear()
105 | router.update(_router)
106 |
107 | # Mark stale threads
108 | configs = {
109 | k.decode(): json.loads(v) for k, v in db.hgetall("proxy:configs").items()
110 | }
111 | logger.debug("代理配置", configs)
112 | marked_threads = {} # threads marked for shutdown, same structure as threads
113 | for proxy_name, fields in threads.items():
114 | if proxy_name not in configs:
115 | logger.info("标记配置被删的代理", proxy_name)
116 | marked_threads[proxy_name] = fields
117 | continue
118 | if fields['version'] != configs[proxy_name]['version']:
119 | logger.info("标记配置更新的代理", proxy_name)
120 | marked_threads[proxy_name] = fields
121 | continue
122 | if proxy_name not in router:
123 | logger.info("标记已无爬虫的代理", proxy_name)
124 | marked_threads[proxy_name] = fields
125 | continue
126 | if not fields['thread'].is_alive():
127 | logger.info("标记异常退出的代理", proxy_name)
128 | marked_threads[proxy_name] = fields
129 | continue
130 |
131 | # Shut down the marked threads
132 | # A thread terminates once it sees it is no longer present in threads
133 | if marked_threads:
134 | for proxy_name in marked_threads.keys():
135 | del threads[proxy_name]
136 | with router_lock:
137 | if proxy_name in router:
138 | del router[proxy_name]
139 | logger.info("等待被标记代理线程退出", list(marked_threads.keys()))
140 | for _ in range(300):
141 | alive = any([t['thread'].is_alive() for t in marked_threads.values()])
142 | if not alive:
143 | break
144 | time.sleep(1)
145 | else:
146 | logger.error("被标记代理线程超时仍未退出")
147 | threads.update(marked_threads)
148 | time.sleep(3)
149 | continue
150 | logger.info("被标记代理线程已全部退出")
151 | # Start new threads
152 | proxy_names_new = set(configs.keys()) - set(threads.keys())
153 | if proxy_names_new:
154 | for proxy_name in proxy_names_new:
155 | targets = router.get(proxy_name, [])
156 | if not targets:
157 | logger.debug("代理名下没有爬虫,暂不启动", proxy_name)
158 | continue
159 | logger.info("启动代理线程", proxy_name)
160 | version = configs[proxy_name].pop('version')
161 | driver = get_driver(configs[proxy_name].pop('driver'))
162 | t = Thread(
163 | target=proxy_handler,
164 | args=(
165 | redis_conf, threads, proxy_name, router, router_lock, driver
166 | ),
167 | kwargs=(configs[proxy_name]),
168 | daemon=True
169 | )
170 | threads[proxy_name] = {
171 | 'version': version,
172 | 'thread': t
173 | }
174 | t.start()
175 | except KeyboardInterrupt:
176 | logger.info("收到Ctrl+C", 'proxy')
177 | break
178 | except Exception as e:
179 | logger.fatal("未处理的异常", type(e), e, '\n', traceback.format_exc())
180 |
--------------------------------------------------------------------------------
/spiders/Ziroom/tasks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import requests
4 | import pytesseract
5 | import urllib.parse
6 | from PIL import Image
7 | from io import BytesIO
8 | from pyloom import tasks
9 | from pyloom.errors import *
10 |
11 |
12 | class ZiRoomTask(tasks.Task):
13 | @tasks.retry(5, 0)
14 | def on_download(self):
15 | # Work around duplicate first pages of the listing (?p=1 vs the bare URL)
16 | page = re.search('\\?p=1', self.url)
17 | if page is not None:
18 | self.client.reuse_proxy()
19 | raise TaskFinish
20 | try:
21 | response = self.client.get(
22 | url=self.url,
23 | allow_redirects=False,
24 | headers={
25 | "User-Agent": self.ua.chrome
26 | }
27 | )
28 | except (ProxyError, RequestError) as e:
29 |             if self.on_download.count >= 5:  # 达到最大重试次数时更换代理
30 | self.logger.debug("请求错误", e)
31 | self.client.reload_proxy()
32 | raise RetryError
33 |
34 | if "请核对您输入的页面地址是否正确" in response.text or "The requested URL could not be retrieved" in response.text:
35 |             if self.on_download.count >= 5:  # 达到最大重试次数时更换代理
36 | self.logger.info("请求次数", self.on_download.count)
37 | self.client.reload_proxy()
38 | else:
39 | self.client.reuse_proxy(5)
40 | raise RetryError
41 | if response.status_code == 302:
42 |             if self.on_download.count >= 5:  # 达到最大重试次数时更换代理
43 | self.logger.info("请求次数", self.on_download.count)
44 | self.client.reload_proxy()
45 | else:
46 | self.client.reuse_proxy(5)
47 | raise RetryError
48 | if response.status_code == 500:
49 | raise RetryError
50 | self.client.reuse_proxy()
51 | return response
52 |
53 | @staticmethod
54 | def get_price(response):
55 | """通过图像匹配返回房租价格"""
56 | image_url = re.search('static8.ziroom.com/phoenix/pc/images/price/(\w+).png', response.text)[0]
57 | image = Image.open(BytesIO(requests.get(f'http://{image_url}').content))
58 | digital_table = pytesseract.image_to_string(image, config='--psm 7')
59 | offset_list = re.search('\\[((,)?\\[(\w)(,\w)+\\])+\\]', response.text)[0]
60 | price_list = []
61 | for offset in offset_list[2:-2].split('],['):
62 | a = ""
63 | for offset_num in offset.split(','):
64 | a = a + (digital_table[int(offset_num)])
65 | price_list.append(a)
66 | return price_list
67 |
68 |
69 | class NLTask(ZiRoomTask):
70 | filters = "http://(\w+).ziroom.com/z/nl/\S+"
71 |
72 | def on_parse(self):
73 | """解析链接"""
74 | house_list = self.response.css.many('#houseList li')
75 | houses = []
76 | if self.response.css.one('.nomsg').default(None).text() is None:
77 | price_list = self.get_price(self.response)
78 | for house in house_list:
79 | houses.append(
80 | {
81 | 'price': price_list[len(houses)],
82 | 'href': house.one('.img a').attrs['href'][2:],
83 | 'img_src': house.one('.img a img').attrs['src'][2:],
84 | 'block': house.one('.img a img').attrs.get('alt', None),
85 | 'name': house.one('.txt h3 a').text(),
86 | 'site': house.one('.txt h4 a').text(),
87 | 'detail': house.one('.txt .detail').text(),
88 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')]
89 | }
90 | )
91 | else:
92 | for house in house_list:
93 | houses.append(
94 | {
95 | 'price': re.search("(\d+)", house.one('.price').text())[0],
96 | 'href': house.one('.img a').attrs['href'][2:],
97 | 'img_src': house.one('.img a img').attrs['src'][2:],
98 | 'block': house.one('.img a img').attrs.get('alt', None),
99 | 'name': house.one('.txt h3 a').text(),
100 | 'site': house.one('.txt h4 a').text(),
101 | 'detail': house.one('.txt .detail').text(),
102 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')]
103 | }
104 | )
105 | return houses
106 |
107 | def on_link(self):
108 | paths = list(set(self.response.re.many('\w+.ziroom.com/z/nl/\S+?.html\\??p?=?\d*')))
109 | return {
110 | 2: [f'http://{house["href"]}' for house in self.result],
111 | 4: [f'http://{path}' for path in paths]
112 | }
113 |
114 | def on_save(self):
115 | self.logger.info(f'抓到房源列表 {self.result}')
116 |
117 |
118 | class VRTask(ZiRoomTask):
119 | filters = "http://(\w+).ziroom.com/z/vr/(\w+)"
120 |
121 | def on_parse(self):
122 | detail_room = {}
123 | for i in self.response.css.many(".detail_room li"):
124 | detail = re.sub('\s', '', i.text()).split(':')
125 | detail_room[detail[0]] = detail[1]
126 | info = {
127 | 'img': [img.attrs['src'] for img in self.response.css.many('.lidiv img')],
128 | 'room_name': self.response.css.one('.room_name h2').default(None).text(),
129 | 'ellipsis': ' '.join(filter(
130 | lambda x: x, self.response.css.one('.room_detail_right .ellipsis').text().split())),
131 | 'room_id': self.response.css.one('#room_id').attrs.get("value"),
132 | 'house_id': self.response.css.one('#house_id').attrs.get("value"),
133 | 'current_city_code': self.response.css.one('#current_city_code').attrs.get('value'),
134 | 'detail_room': detail_room,
135 | 'number': self.response.css.one('.aboutRoom h3').text()[3:],
136 | 'periphery': self.response.css.many('.aboutRoom p')[0].text()[3:],
137 | 'traffic': self.response.css.many('.aboutRoom p')[1].text()[3:]
138 | }
139 | roommate = []
140 | for i in self.response.css.many('.greatRoommate li'):
141 | roommate.append({
142 | '性别': i.attrs.get("class")[0],
143 | '房间号': i.one('.user_top p').text(),
144 | '星座': i.one('.sign').text()[0:-2],
145 | '职业': i.one('.jobs').text()[0:-2],
146 | '入住时间': i.one('.user_bottom p').text(),
147 | })
148 | info['roommate'] = roommate
149 | conf = self.client.get(
150 | url=f"http://www.ziroom.com/detail/config?house_id={info['house_id']}&id={info['room_id']}",
151 | headers={
152 | "User-Agent": self.ua.chrome
153 | }
154 | )
155 | configuration = []
156 | for i in conf.json['data']:
157 | if conf.json['data'].get(i) == 1:
158 | configuration.append(i)
159 | info['configuration'] = configuration
160 | cookies = self.response.cookies.get_dict()
161 | for cookie in cookies:
162 | if 'nlist' in cookie:
163 | info['price'] = json.loads(urllib.parse.unquote(
164 | self.response.cookies.get_dict()[cookie]))[info["room_id"]]['sell_price']
165 | break
166 | return info
167 |
168 | def on_save(self):
169 | self.logger.info(f'抓到房源信息 {self.result}')
170 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuo/tasks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import random
3 | import string
4 | import datetime
5 | from pyloom import tasks
6 | from pyloom.errors import *
7 |
8 |
9 | def get_list_id(opt_id):
10 | """返回list_id:(opt_id)_(10位随机字符串)"""
11 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10))
12 |
13 |
14 | class PinDuoDuoTask(tasks.Task):
15 |     """拼多多接口任务基类"""
16 | _redis = None
17 | goods_url = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/"
18 | operation_url = "http://apiv4.yangkeduo.com/v4/operation/"
19 |
20 | @tasks.retry(tries=5, backoff=0)
21 | def on_download(self):
22 | """下载链接"""
23 | try:
24 | resp = self.client.get(
25 | url=self.url,
26 | headers={
27 | "User-Agent": self.ua.android,
28 | "Referer": "Android",
29 | "Host": "apiv4.yangkeduo.com"
30 | }
31 | )
32 | except ProxyError:
33 | self.client.reload_proxy()
34 | raise RetryError
35 | except RequestError:
36 | raise RetryError
37 |
38 | try:
39 | if "error_code" in resp.json:
40 | error_code = resp.json.get('error_code', None)
41 | else:
42 | error_code = None
43 | except JSONDecodeError:
44 | error_code = None
45 |
46 | if error_code == 40001 or resp.status_code == 503 or resp.status_code == 504:
47 | self.client.reuse_proxy()
48 | raise RetryError
49 |
50 | if resp.status_code == 403 or resp.status_code == 429:
51 | self.client.reload_proxy()
52 | raise RetryError
53 | else:
54 | self.client.reuse_proxy()
55 | return resp
56 |
57 |
58 | class HomeOperationTask(PinDuoDuoTask):
59 | """搜索栏"""
60 | filters = "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations\?pdduid="
61 |
62 | def on_parse(self):
63 | targets = []
64 | for childrens in self.response.json:
65 | targets.append(childrens["id"])
66 | for children in childrens["children"]:
67 | targets.append(children["id"])
68 | return targets
69 |
70 | def on_link(self):
71 | return {
72 | 4: [f"{self.operation_url}{opt_id}/groups?opt_type=2&size=50&offset=0&list_id={get_list_id(opt_id)}&pdduid="
73 | for opt_id in self.result]
74 | }
75 |
76 |
77 | class OperationTask(PinDuoDuoTask):
78 | """分类商品结果"""
79 | filters = "http://apiv4.yangkeduo.com/v4/operation/(\w+)"
80 |
81 | def on_parse(self):
82 | goods = []
83 | for good in self.response.json["goods_list"]:
84 | goods.append(
85 | {
86 | "goods_id": good["goods_id"],
87 | "goods_name": good["goods_name"],
88 | "thumb_url": good["thumb_url"],
89 | "cnt": good["cnt"],
90 | "normal_price": good["normal_price"],
91 | "market_price": good["market_price"],
92 | "price": good["group"]["price"],
93 | "updated_at": str(datetime.datetime.now())
94 | }
95 | )
96 | operation = {
97 | "goods_id": [good["goods_id"] for good in goods],
98 | "opt_infos": self.response.json["opt_infos"],
99 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1],
100 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1],
101 | "flip": self.response.json["flip"],
102 | "next_offset": str(self.response.json["flip"]).split(";")[-1]
103 | }
104 | return goods, operation
105 |
106 | def on_link(self):
107 | goods, operation = self.result
108 |
109 | goods_list = [f"{self.goods_url}{goods_id}?goods_id={goods_id}&from=0&pdduid="
110 | for goods_id in operation["goods_id"]]
111 | operation_list = [f'{self.operation_url}{opt_infos["id"]}/groups?opt_type=2&size=50&offset=0'
112 | f'&list_id={get_list_id(opt_infos["id"])}&pdduid='
113 | for opt_infos in operation["opt_infos"]]
114 |
115 | if operation["flip"] is not None:
116 | operation_list.append(f'{self.operation_url}{operation["opt_id"]}/groups?opt_type=2&size=50&offset='
117 | f'{operation["next_offset"]}&list_id={operation["list_id"]}'
118 | f'&flip={operation["flip"]}&pdduid=')
119 | return {
120 | 2: goods_list,
121 | 4: operation_list
122 | }
123 |
124 | def on_save(self):
125 | self.logger.info(f'抓到商品列表 {self.result[0]}')
126 |
127 |
128 | class GoodsTask(PinDuoDuoTask):
129 | """商品详情接口"""
130 | filters = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/(\w+)"
131 |
132 | def on_parse(self):
133 | _sku = self.response.json["sku"]
134 | goods_info = {
135 | "goods_sn": self.response.json["goods_sn"],
136 | "goods_id": self.response.json["goods_id"],
137 | "cat_id": self.response.json["cat_id"],
138 | "goods_name": self.response.json["goods_name"],
139 | "goods_desc": self.response.json["goods_desc"],
140 | "market_price": self.response.json["market_price"],
141 | "is_onsale": self.response.json["is_onsale"],
142 | "thumb_url": self.response.json["thumb_url"],
143 | "hd_thumb_url": self.response.json["hd_thumb_url"],
144 | "image_url": self.response.json["image_url"],
145 | "goods_type": self.response.json["goods_type"],
146 | "gallery": [{"id": gallery["id"], "url":gallery["url"]} for gallery in self.response.json["gallery"]],
147 | "created_at": self.response.json["created_at"],
148 | "sales": self.response.json["sales"],
149 | "price": {
150 | "min_on_sale_group_price": self.response.json["min_on_sale_group_price"],
151 | "max_on_sale_group_price": self.response.json["max_on_sale_group_price"],
152 | "min_on_sale_normal_price": self.response.json["min_on_sale_normal_price"],
153 | "max_on_sale_normal_price": self.response.json["max_on_sale_normal_price"],
154 | "min_group_price": self.response.json["min_group_price"],
155 | "max_group_price": self.response.json["max_group_price"],
156 | "max_normal_price": self.response.json["max_normal_price"],
157 | "min_normal_price": self.response.json["min_normal_price"],
158 | "old_min_on_sale_group_price": self.response.json["old_min_on_sale_group_price"],
159 | "old_max_on_sale_group_price": self.response.json["old_max_on_sale_group_price"],
160 | "old_min_group_price": self.response.json["old_min_group_price"],
161 | "old_max_group_price": self.response.json["old_max_group_price"]
162 | },
163 | "cat_id_list": [self.response.json["cat_id_1"],
164 | self.response.json["cat_id_2"],
165 | self.response.json["cat_id_3"]]
166 | }
167 | sku = []
168 | for sku_list in _sku:
169 | sku.append({
170 | "sku_id": sku_list["sku_id"],
171 | "goods_id": sku_list["goods_id"],
172 | "thumb_url": sku_list["thumb_url"],
173 | "quantity": sku_list["quantity"],
174 | "normal_price": sku_list["normal_price"],
175 | "group_price": sku_list["group_price"],
176 | "old_group_price": sku_list["old_group_price"],
177 | "specs": sku_list["specs"]
178 | })
179 | goods_info["sku"] = sku
180 | return goods_info
181 |
182 | def on_save(self):
183 | self.logger.info(f'抓到商品信息 {self.result}')
184 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuoWEB/tasks.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import execjs
4 | import string
5 | import datetime
6 | import re
7 | import os
8 | from pyloom import tasks
9 | from pyloom.errors import *
10 |
11 |
12 | def get_list_id(opt_id):
13 | """返回list_id:(opt_id)_(10位随机字符串)"""
14 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10))
15 |
16 |
17 | def get_anti_content(ua):
18 | path = os.path.abspath(os.path.dirname(__file__))
19 | with open(os.path.join(path, 'get_anticontent.js'), 'r', encoding='utf-8') as f:
20 | js = f.read()
21 | ctx = execjs.compile(js)
22 | url = "https://mobile.yangkeduo.com/catgoods.html"
23 | return ctx.call('get_anti', url, ua)
24 |
25 |
26 | class SearchTask(tasks.Task):
27 | filters = "https://mobile.yangkeduo.com/classification.html"
28 |
29 | @tasks.retry()
30 | def on_download(self):
31 | try:
32 | ua = self.ua.chrome # 随机获取ua
33 | resp = self.client.get(
34 | url=self.url,
35 | headers={
36 | "User-Agent": ua,
37 | 'authority': 'mobile.yangkeduo.com',
38 | 'pragma': 'no-cache',
39 | 'cache-control': 'no-cache',
40 | 'upgrade-insecure-requests': '1',
41 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
42 | 'application/signed-exchange;v=b3',
43 | 'accept-encoding': 'gzip, deflate, br',
44 | 'accept-language': 'zh-CN,zh;q=0.9'
45 | }
46 | )
47 | except ProxyError:
48 | self.client.reload_proxy()
49 | raise RetryError
50 | except RequestError:
51 | raise RetryError
52 | return resp
53 |
54 | def on_parse(self):
55 | data = json.loads(self.response.re.many("__NEXT_DATA__.*?__NEXT_LOADED_PAGES")[0][16:-20])
56 | result = []
57 | for i in data['props']['pageProps']['data']['operationsData']['detailData']:
58 | for j in i['cat']:
59 | result.append(f'https://mobile.yangkeduo.com/proxy/api/v4/operation/{j["optID"]}/groups'
60 | f'?offset=0&size=100&opt_type=2&sort_type=DEFAULT&list_id={get_list_id(j["optID"])}'
61 | f'&pdduid=0')
62 | return result
63 |
64 | def on_link(self):
65 | """解析url,并添加到队列"""
66 | return {
67 | 4: self.result
68 | }
69 |
70 |
71 | class ListTask(tasks.Task):
72 | filters = "https://mobile.yangkeduo.com/proxy/api/v4/operation/(\w+)"
73 |
74 | @tasks.retry()
75 | def on_download(self):
76 | try:
77 | ua = self.ua.chrome # 随机获取ua
78 | url = self.url + f"&anti_content={get_anti_content(ua)}"
79 | resp = self.client.get(
80 | url=url,
81 | headers={
82 | "User-Agent": ua,
83 | 'authority': 'mobile.yangkeduo.com',
84 | 'pragma': 'no-cache',
85 | 'cache-control': 'no-cache',
86 | 'upgrade-insecure-requests': '1',
87 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
88 | 'application/signed-exchange;v=b3',
89 | 'accept-encoding': 'gzip, deflate, br',
90 | 'accept-language': 'zh-CN,zh;q=0.9'
91 | }
92 | )
93 | except ProxyError:
94 | self.client.reload_proxy()
95 | raise RetryError
96 | except RequestError:
97 | raise RetryError
98 |
99 | return resp
100 |
101 | def on_parse(self):
102 | goods = []
103 | for good in self.response.json["goods_list"]:
104 | goods.append(
105 | {
106 | "thumb_url": good["thumb_url"],
107 | "country": good["country"],
108 | "goods_name": good["goods_name"],
109 | "short_name": good["short_name"],
110 | "sales_tip": good["sales_tip"],
111 | "cnt": good["cnt"],
112 | "goods_id": good["goods_id"],
113 | "hd_thumb_url": good["hd_thumb_url"],
114 | "hd_url": good["hd_url"],
115 | "normal_price": good["normal_price"],
116 | "market_price": good["market_price"],
117 | "price": good["group"]["price"],
118 | "link_url": good["link_url"],
119 | "mall_name": good.get('mall_name'),
120 | "tag": [i["text"] for i in good["tag_list"]],
121 | "updated_at": str(datetime.datetime.now())
122 | }
123 | )
124 | operation = {
125 | "link_url": [good["link_url"] for good in goods],
126 | "opt_infos": self.response.json["opt_infos"],
127 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1],
128 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1],
129 | "flip": self.response.json["flip"],
130 | "next_offset": str(self.response.json["flip"]).split(";")[0]
131 | }
132 | return goods, operation
133 |
134 | def on_link(self):
135 | url = "https://mobile.yangkeduo.com/"
136 |
137 | goods, operation = self.result
138 | goods_list = [f'{url}{link_url}' for link_url in operation["link_url"]]
139 | operation_list = [f'{url}/proxy/api/v4/operation/{opt["id"]}/groups?offset=0&size=100&opt_type=2'
140 | f'&sort_type=DEFAULT&list_id={get_list_id(opt["id"])}&pdduid=0'
141 | for opt in operation["opt_infos"]]
142 | if operation["flip"] is not None:
143 | operation_list.append(f'{url}/proxy/api/v4/operation/{operation["opt_id"]}/groups?opt_type=2&size=100'
144 | f'&offset={operation["next_offset"]}&list_id={operation["list_id"]}'
145 | f'&flip={operation["flip"]}&pdduid=0')
146 | self.logger.debug(goods_list)
147 | self.logger.debug(operation_list)
148 | return {
149 | 1: goods_list,
150 | 4: operation_list
151 | }
152 |
153 | def on_save(self):
154 | self.logger.debug(self.result[0])
155 |
156 |
157 | class GoodsTask(tasks.Task):
158 | filters = "https://mobile.yangkeduo.com/goods.html"
159 |
160 | @tasks.retry()
161 | def on_download(self):
162 | try:
163 | resp = self.client.get(
164 | url=self.url,
165 | headers={
166 | "User-Agent": self.ua.chrome,
167 | 'authority': 'mobile.yangkeduo.com',
168 | 'pragma': 'no-cache',
169 | 'cache-control': 'no-cache',
170 | 'upgrade-insecure-requests': '1',
171 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
172 | 'application/signed-exchange;v=b3',
173 | 'accept-encoding': 'gzip, deflate, br',
174 | 'accept-language': 'zh-CN,zh;q=0.9'
175 | }
176 | )
177 | except ProxyError:
178 | self.client.reload_proxy()
179 | raise RetryError
180 | except RequestError:
181 | raise RetryError
182 | self.logger.debug(resp.status_code)
183 |
184 | if re.search('"initDataObj":{"needLogin":true}', resp.text) is not None:
185 | raise RetryError
186 | return resp
187 |
188 | def on_parse(self):
189 | data = json.loads(self.response.re.many("window.rawData=.*?}};")[0][15:-1])
190 | return {
191 | "goods": data["store"]["initDataObj"]["goods"],
192 | "mall": data["store"]["initDataObj"]["mall"],
193 | "reviews": data["store"]["initDataObj"]["reviews"],
194 | }
195 |
--------------------------------------------------------------------------------
/spiders/WeiBo/README.md:
--------------------------------------------------------------------------------
1 | ## 微博爬虫
2 |
3 | ### 用户信息(UserTask)
4 |
5 | | 字段 | 示例 | 说明 |
6 | | ------------------------ | ---------------------------------- | ---------------------------------- |
7 | | result.uid | 1680938527 | 用户唯一标识 |
8 | | result.screen_name | 作恶太妖精 | 用户昵称 |
9 | | result.statuses_count | 12275 | 微博数量 |
10 | | result.verified_type | -1 | 账号类型,见[附录](#verified_type) |
11 | | result.verified_type_ext | -1 | 附加账号类型,-1:无 1:橙V 0:金V |
12 | | result.description | 因为追求梦想而伟大!梦想是熬出来的 | 简介 |
13 | | result.gender | f | 性别,f:女 m:男 |
14 | | result.mbtype | 0 | 未知 |
15 | | result.urank | 35 | 账号等级 |
16 | | result.mbrank | 0 | 会员等级 |
17 | | result.followers_count | 754 | 粉丝数量 |
18 | | result.follow_count | 602 | 关注数量 |
19 | | result.profile_image_id | 6431161fjw1e8qgp5bmzyj2050050aa8 | 头像图片号 |
20 | | result.status | 0 | 账号状态,-1:不可用 0:可用 |
21 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 |
22 |
23 | ```python
24 | self.result = {
25 | 'uid': 2554193671,
26 | 'screen_name': '黑镜头世界',
27 | 'statuses_count': 88,
28 | 'verified_type': -1,
29 | 'verified_type_ext': -1,
30 | 'description': '一张残旧的老照片,能给你带来灌顶的震撼~',
31 | 'gender': 'm',
32 | 'mbtype': 0,
33 | 'urank': 2,
34 | 'mbrank': 0,
35 | 'followers_count': 84,
36 | 'follow_count': 4,
37 | 'profile_image_id': '983de707jw1e8qgp5bmzyj2050050aa8',
38 | 'status': 0,
39 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 10, 231390)
40 | }
41 | ```
42 |
43 |
44 |
45 | ### 原创微博(UserTask)
46 |
47 | 每个用户前10条微博中的原创微博
48 |
49 | | 字段 | 示例 | 说明 |
50 | | ---------------------- | ------------------------------------ | ------------ |
51 | | result.mid | 4264355334054790 | 微博唯一标识 |
52 | | result.uid | 1225419417 | 用户唯一标识 |
53 | | result.text | 哇!抽到了!爱国宝 | 微博正文 |
54 | | result.reposts_count | 14 | 转发数量 |
55 | | result.comments_count | 114 | 评论数量 |
56 | | result.attitudes_count | 1481 | 点赞数量 |
57 | | result.source | iPhone X | 来源 |
58 | | result.updated_at | 2018-08-10 00:02:09 | 抓取时间 |
59 | | result.created_at | 2018-07-21 22:56:41 | 发表时间 |
60 | | result.images | ["490a6a99gy1fthvjguf0gj20v91voqbr"] | 图片列表 |
61 | | result.is_long_text | False | 是否为长微博 |
62 |
63 | ```python
64 | self.result = [
65 | {
66 | 'mid': 4278823505781372,
67 | 'uid': 2094949595,
68 | 'text': '杭州的绿水青山留下了许多诗句,和风熏,杨柳轻,郁郁青山江水平,笑语满香径。什么使你爱上了这座城市?{网页链接}(https://weibo.com/tv/v/Gw6iL1Q0e?fid=1034:4276507087207862) \u200b',
69 | 'reposts_count': 1,
70 | 'comments_count': 1,
71 | 'attitudes_count': 2,
72 | 'source': '微博 weibo.com',
73 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904636),
74 | 'created_at': datetime.datetime(2018, 8, 30, 21, 8, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))),
75 | 'images': [],
76 | 'is_long_text': False
77 | },
78 | {
79 | 'mid': 4278785248875113,
80 | 'uid': 2094949595,
81 | 'text': '你当时学的专业是什么?你现在又在做什么工作呢? \u200b',
82 | 'reposts_count': 0,
83 | 'comments_count': 12,
84 | 'attitudes_count': 1,
85 | 'source': '微博 weibo.com',
86 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904846),
87 | 'created_at': datetime.datetime(2018, 8, 30, 18, 36, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))),
88 | 'images': ['7cde64dbgy1furl2c240jj20e80cujs2'],
89 | 'is_long_text': False
90 | },
91 | ]
92 |
93 | ```
94 |
95 |
96 |
97 | ### 转发微博(UserTask)
98 |
99 | 每个用户前10条微博中的转发微博
100 |
101 | | 字段 | 示例 | 说明 |
102 | | ---------------------- | ---------------------- | ------------------------------- |
103 | | result.mid | 4269756171586532 | 微博唯一标识 |
104 | | result.uid | 1680938527 | 用户唯一标识 |
105 | | result.text | //@李宇春如初:转发微博 | 微博正文 |
106 | | result.reposts_count | 0 | 转发数量 |
107 | | result.comments_count | 0 | 评论数量 |
108 | | result.attitudes_count | 0 | 点赞数量 |
109 | | result.source | iPhone客户端 | 来源 |
110 | | result.pmid | 4269752379437757 | 父级微博的mid(上层转发,可空) |
111 | | result.smid | 4269748974496983 | 源微博的mid(原创微博) |
112 | | result.suid | 5427461387 | 源微博的uid |
113 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 |
114 | | result.created_at | 2018-08-05 20:37:42 | 发表时间 |
115 |
116 | ```python
117 | self.result = [
118 | {'mid': 4278871820165470,
119 | 'uid': 1802393212,
120 | 'text': '这壁纸超萌哦,喜欢就快来打call @Line壁纸酱',
121 | 'reposts_count': 0,
122 | 'comments_count': 0,
123 | 'attitudes_count': 2,
124 | 'source': '皮皮时光机',
125 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250057),
126 | 'created_at': datetime.datetime(2018, 8, 31, 0, 20, 2, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))),
127 | 'pmid': 0,
128 | 'smid': 4278723350035431,
129 | 'suid': 6150916523
130 | },
131 | {
132 | 'mid': 4278866795185761,
133 | 'uid': 1802393212,
134 | 'text': '[心]',
135 | 'reposts_count': 0,
136 | 'comments_count': 0,
137 | 'attitudes_count': 2,
138 | 'source': '皮皮时光机',
139 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250450),
140 | 'created_at': datetime.datetime(2018, 8, 31, 0, 0, 4, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))),
141 | 'pmid': 0,
142 | 'smid': 4266013078506248,
143 | 'suid': 5604000425}
144 | ]
145 | ```
146 |
147 |
148 |
149 | ### 关注列表(FollowerTask)
150 |
151 | 每个用户最后180个关注、部分大V关注
152 |
153 | | 字段 | 示例 | 说明 |
154 | | ------ | ------------------------ | ----------------------- |
155 | | result | [5427461387, 1680938527] | 关注列表中所有用户的uid |
156 |
157 | ```python
158 | self.result = [
159 | 1199430302, 5291824241, 1744583555, 1225627080, 1192504311, 1539469391, 1831216671, 1855790127,
160 | ]
161 | ```
162 |
163 | + 通过self.uid获取当前用户UID
164 |
165 |
166 |
167 | ### 粉丝列表(FanTask)
168 |
169 | 每个用户最后4500个粉丝、部分大V粉丝
170 |
171 | | 字段 | 示例 | 说明 |
172 | | ------ | -------------------------- | ----------------------- |
173 | | result | [5427461387, 1680938527] | 粉丝列表中所有用户的uid |
174 |
175 | ```python
176 | self.result = [
177 | 2011541160, 6561198332, 5650361179, 5203386014, 6586203686, 3975892466, 5280555723, 6200526771,
178 | ]
179 | ```
180 |
181 | + 通过self.uid获取当前用户UID
182 |
183 |
184 |
185 | ### 附录
186 |
187 | #### verified_type
188 |
189 | |值|含义|
190 | |:---|:---|
191 | |-1|无认证|
192 | |0|个人认证|
193 | |1|政府|
194 | |2|企业|
195 | |3|媒体|
196 | |4|校园|
197 | |5|网站|
198 | |6|应用|
199 | |7|机构|
200 | |8|待审企业|
201 | |200|初级达人|
202 | |220|中高级达人|
203 | |400|已故V用户|
204 |
205 |
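206 | 
207 | ### 入库示例
208 | 
209 | 各Task的on_save默认只打印日志;若需持久化上述result,可参照下面的示例改写on_save。
210 | 示例中的pymongo、MongoDB地址与库名均为假设,并非本项目自带依赖,仅示意result结构的消费方式。
211 | 
212 | ```python
213 | import pymongo
214 | 
215 | # 假设:本地MongoDB,库名weibo(示例假设,可按需替换)
216 | client = pymongo.MongoClient("mongodb://localhost:27017")
217 | db = client["weibo"]
218 | 
219 | 
220 | def save_user_task_result(result):
221 |     """按上文UserTask的result结构(user, statuses, reposts)入库"""
222 |     user, statuses, reposts = result
223 |     # 以uid为主键,存在则更新
224 |     db.users.update_one({"uid": user["uid"]}, {"$set": user}, upsert=True)
225 |     # 原创与转发微博统一以mid为主键入库
226 |     for status in statuses + reposts:
227 |         db.statuses.update_one({"mid": status["mid"]}, {"$set": status}, upsert=True)
228 | ```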
--------------------------------------------------------------------------------
/pyloom/buckets.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import fnmatch
4 | import threading
5 | from .errors import BucketError
6 | from redis import StrictRedis, exceptions
7 |
8 |
9 | class LocalBucket(object):
10 | """进程内存储,重启后数据丢失"""
11 | _lock = None
12 | _instances = {}
13 |
14 | def __init__(self):
15 | self._db = {}
16 | if LocalBucket._lock is None:
17 | LocalBucket._lock = threading.Lock()
18 |
19 | @classmethod
20 | def instance(cls, name):
21 | """获取单例"""
22 | var = LocalBucket._instances.get(name, None)
23 | if var:
24 | return var
25 | var = LocalBucket()
26 | LocalBucket._instances[name] = var
27 | return var
28 |
29 | @classmethod
30 | def purge(cls):
31 | """清理由instance创建的所有实例的过期key,返回被清理的数量"""
32 | count = 0
33 | for instance in cls._instances.values():
34 | count += instance._purge()
35 | return count
36 |
37 | def _purge(self):
38 | """清理实例中过期的key,返回被清理的数量"""
39 | keys = []
40 | for key, (_, expire_at) in self._db.items():
41 | if expire_at is not None and expire_at <= time.time():
42 | keys.append(key)
43 | for key in keys:
44 | del self._db[key]
45 | return len(keys)
46 |
47 | def set(self, key, value, ttl=None):
48 | """为key设置value,ttl秒后失效"""
49 | item = self._db.get(key, None)
50 | if item is None or ttl is not None:
51 | # 更改value和ttl
52 | if ttl is None:
53 | expire_at = None
54 | else:
55 | expire_at = time.time() + ttl
56 | self._db[key] = [value, expire_at]
57 | else:
58 |             # 只更改value,保留原有ttl
59 | self._db[key][0] = value
60 |
61 | def delete(self, *keys) -> int:
62 | """删除一个或多个key,返回被删除的数量"""
63 | count = 0
64 | for key in keys:
65 | item = self._db.get(key, None)
66 | # 忽略不存在的key
67 | if item is None:
68 | continue
69 | expire_at = item[1]
70 | if expire_at is None or expire_at > time.time():
71 | del self._db[key]
72 | count += 1
73 | else: # 键已过期,不累加计数器
74 | del self._db[key]
75 | return count
76 |
77 | def get(self, key) -> object:
78 | """返回key的value,当key不存在时返回None"""
79 | item = self._db.get(key, None)
80 | if item is None:
81 | return None
82 | value, expire_at = item
83 | if expire_at is None:
84 | return value
85 | elif expire_at > time.time():
86 | return value
87 | else: # 键已过期
88 | del self._db[key]
89 | return None
90 |
91 | def getset(self, key, value) -> object:
92 | """为给定key设置新value,返回旧value"""
93 | old_value = self.get(key)
94 | self.set(key, value)
95 | return old_value
96 |
97 | def keys(self, pattern='*') -> list:
98 | """
99 | 返回满足pattern的所有键
100 | pattern支持通配符:?、*、[]
101 | """
102 | expired_keys = []
103 | valid_keys = []
104 | n = time.time()
105 | for key, (_, expire_at) in self._db.items():
106 | if expire_at is not None and expire_at <= n:
107 | expired_keys.append(key)
108 | else:
109 | if fnmatch.fnmatch(key, pattern):
110 | valid_keys.append(key)
111 | for key in expired_keys:
112 | del self._db[key]
113 | return valid_keys
114 |
115 | def expire(self, key, ttl) -> bool:
116 | """为给定key设置生存时间,ttl秒后被自动删除"""
117 | item = self._db.get(key, None)
118 | if item is None:
119 | return False
120 | _, expire_at = item
121 | if expire_at is None or expire_at >= time.time():
122 | self._db[key][1] = ttl + time.time()
123 | return True
124 | else: # 键已过期
125 | del self._db[key]
126 | return False
127 |
128 | def ttl(self, key) -> int:
129 | """
130 | 返回给定key的剩余生存时间
131 | Returns:
132 | 当key不存在时,返回-2;
133 | 当key存在但没有设置剩余生存时间时,返回-1;
134 | 否则,返回key的剩余生存时间
135 | """
136 | item = self._db.get(key, None)
137 | if item is None:
138 | return -2
139 | value, expire_at = item
140 | if expire_at is None:
141 | return -1
142 | elif expire_at > time.time():
143 | return expire_at - time.time()
144 | else: # 键已过期
145 | del self._db[key]
146 | return -2
147 |
148 | def incr(self, key, amount=1) -> int:
149 | """
150 | 将给定key的值加上amount,返回incr后的值
151 | 若key不存在,key被先初始化为0,再incr
152 | 若value非int型,抛出异常
153 | """
154 | with LocalBucket._lock:
155 | old_value = self.get(key)
156 | if old_value is None:
157 | self.set(key, 0, None)
158 | old_value = 0
159 | elif not isinstance(old_value, int):
160 | raise BucketError("incr应作用于int型的值")
161 | new_value = old_value + amount
162 | self.set(key, new_value)
163 | return new_value
164 |
165 |
166 | class ShareBucket(object):
167 | """共享存储,利用redis存储,不易失"""
168 | prefix = "bucket"
169 |
170 | def __init__(self, db: StrictRedis, name):
171 | self._db = db
172 | self.name = name
173 | self.key_prefix = f"{self.prefix}:{name}"
174 |
175 | def set(self, key, value, ttl=None):
176 | """为key设置value,ttl秒后失效"""
177 | self._db.set(f"{self.key_prefix}:{key}", json.dumps(value), ex=ttl)
178 |
179 | def delete(self, *keys) -> int:
180 | """删除一个或多个key,返回被删除的数量"""
181 | return self._db.delete(*[f"{self.key_prefix}:{k}" for k in keys])
182 |
183 | def get(self, key) -> object:
184 | """返回key的value,当key不存在时返回None"""
185 | res = self._db.get(f"{self.key_prefix}:{key}")
186 | if res:
187 | return json.loads(res)
188 | else:
189 | return res
190 |
191 | def getset(self, key, value) -> object:
192 | """为给定key设置新value,返回旧value"""
193 |         res = self._db.getset(f"{self.key_prefix}:{key}", json.dumps(value))
194 | if res:
195 | return json.loads(res)
196 | else:
197 | return res
198 |
199 | def keys(self, pattern='*') -> list:
200 | """
201 | 返回满足pattern的所有键
202 | pattern支持通配符:?、*、[]
203 | """
204 | p = len(f"{self.key_prefix}:")
205 | res = self._db.keys(f"{self.key_prefix}:{pattern}")
206 | return [r.decode()[p:] for r in res]
207 |
208 | def expire(self, key, ttl) -> bool:
209 | """为给定key设置生存时间,ttl秒后被自动删除"""
210 | return self._db.expire(f"{self.key_prefix}:{key}", ttl)
211 |
212 | def ttl(self, key) -> int:
213 | """
214 | 返回给定key的剩余生存时间
215 | Returns:
216 | 当key不存在时,返回-2;
217 | 当key存在但没有设置剩余生存时间时,返回-1;
218 | 否则,返回key的剩余生存时间
219 | """
220 | return self._db.ttl(f"{self.key_prefix}:{key}")
221 |
222 | def incr(self, key, amount=1) -> int:
223 | """
224 | 将给定key的值加上amount,返回incr后的值
225 | 若key不存在,key被先初始化为0,再incr
226 | 若value非int型,抛出异常
227 | """
228 | try:
229 | return self._db.incr(f"{self.key_prefix}:{key}", amount)
230 | except exceptions.ResponseError as e:
231 | if e.args[0] == 'value is not an integer or out of range':
232 | raise BucketError("incr应作用于int型的值")
233 |
234 | def lpush(self, key, *values) -> int:
235 | """
236 | 将一个或多个值value插入到列表key的表头
237 | 返回执行LPUSH命令后,列表的长度。
238 | """
239 | return self._db.lpush(f"{self.key_prefix}:{key}", *values)
240 |
241 | def lrange(self, key, start, end) -> list:
242 | """
243 | 返回列表 key 中指定区间内的元素,区间以偏移量 start 和 stop 指定。
244 | 包含指定区间内的元素的list
245 | """
246 | return self._db.lrange(f"{self.key_prefix}:{key}", start, end)
247 |
248 | def lock(self, key, timeout, **kwargs):
249 | """分布式锁"""
250 | return self._db.lock(f"{self.key_prefix}:{key}", timeout, **kwargs)
251 |
--------------------------------------------------------------------------------
/spiders/LaGou/tasks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import uuid
4 | import random
5 | import string
6 | import datetime
7 | from pyloom import tasks, errors
8 |
9 |
10 | class LaGouJobTask(tasks.Task):
11 | @staticmethod
12 | def get_random(const):
13 | return "".join(random.sample(string.ascii_letters + string.digits, const))
14 |
15 | @staticmethod
16 | def get_uuid():
17 | return time.strftime("%Y%m%d%H%M%S-", time.localtime()) + str(uuid.uuid1())
18 |
19 | def get_cookies(self):
20 | cookies = {
21 | 'LGUID': self.get_uuid(),
22 | 'user_trace_token': '20180705084851-8a154ee4-0f2b-406d-9130-e835805b49ee',
23 | 'X_HTTP_TOKEN': 'c2e6c0237f5362aca8d13748cfdd8274',
24 | 'JSESSIONID': self.get_random(47).upper(),
25 | 'SEARCH_ID': self.get_random(32).lower(),
26 | 'LGSID': self.get_uuid(),
27 | 'PRE_UTM': '',
28 | 'PRE_HOST': '',
29 | 'PRE_SITE': '',
30 | 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com',
31 | 'LGRID': self.get_uuid()
32 | }
33 | return cookies
34 |
35 | @tasks.retry(5, 0)
36 | def on_download(self):
37 | """下载页面"""
38 | if self.buckets.local.get('cookies') is None:
39 | self.buckets.local.set('cookies', self.get_cookies())
40 | cookies = self.buckets.local.get('cookies')
41 | try:
42 | response = self.client.get(
43 | url=self.url,
44 | allow_redirects=False,
45 | headers={
46 | "User-Agent": self.ua.chrome,
47 | "Accept-Encoding": "gzip",
48 | "Host": "www.lagou.com",
49 | "Referer": "https://www.lagou.com/jobs/list_"
50 | },
51 | cookies=cookies
52 | )
53 | except errors.ProxyError:
54 | self.logger.info("代理错误")
55 | raise errors.RetryError
56 | except errors.RequestError:
57 | self.logger.info("请求错误")
58 | raise errors.RetryError
59 |
60 | if response.status_code == 301:
61 | # 页面被删除
62 | raise errors.TaskFinish
63 | elif response.status_code == 302:
64 | self.logger.info(f"网页被封")
65 | self.buckets.local.set('cookies', self.get_cookies())
66 | self.queue.freeze(5)
67 | raise errors.RetryError
68 | elif "页面加载中" in response.text or "错误网关" in response.text:
69 | raise errors.RetryError
70 | else:
71 | return response
72 |
73 |
74 | class JobDetails(LaGouJobTask):
75 | """职位详情页面"""
76 | filters = "https://www.lagou.com/jobs/(\d+).html"
77 |
78 | def on_parse(self):
79 | """提取数据"""
80 | try:
81 | publish_time = self.response.css.one(".publish_time").text()[0:-8]
82 | except errors.TaskError as e:
83 | return
84 |
85 | if re.match("(\d+):(\d+)", publish_time) is not None:
86 | publish_time = time.strftime("%Y-%m-%d", time.localtime())
87 |         elif re.match("(\d+)天前", publish_time):
88 |             days = int(re.match("(\d+)天前", publish_time).group(1))  # 取完整天数,避免两位数只取到首位
89 |             publish_time = (datetime.date.today() - datetime.timedelta(days=days)).strftime('%Y-%m-%d')
90 | status = 0 if self.response.css.one(".send-CV-btn").text() == "投个简历" else 1
91 | result = {
92 | "_id": re.search("(\d+)", self.url).group(0),
93 | "title": self.response.css.one(".job-name > .name").text(),
94 | "label": [label.text() for label in self.response.css.many(".labels")],
95 | "job_request": "".join(request.text() for request in self.response.css.many(".job_request > p > span")),
96 | "advantage": self.response.css.one(".job-advantage > p").text(),
97 | "job_bt": self.response.css.one(".job_bt").text(),
98 | "work_addr": self.response.css.one(".work_addr").text()[0:-8],
99 | "status": status,
100 | "job_company": self.response.css.one("#job_company > dt > a > img").attrs["alt"],
101 | "type": self.response.css.one(".c_feature > li").text(),
102 | "time": publish_time
103 | }
104 | return result
105 |
106 | def on_link(self):
107 | """提取链接"""
108 | job_urls = []
109 |         max_id = self.buckets.share.get("max_id") or 4913130  # 已知的最大职位ID
110 |         use_id = self.buckets.share.get("use_id") or -1  # 已生成过URL的最大职位ID
111 | waiting_url_const = self.queue.detail["waiting"][1] # 当前职位等待队列中的URL数量
112 |
113 | if use_id >= max_id:
114 | self.queue.interval = 3600
115 |
116 | if waiting_url_const <= 2000:
117 |             # 当等待队列中的URL数量不足2000时,补充URL到等待队列中
118 | start_id = use_id + 1
119 | end_id = use_id + 2000 if (use_id + 2000) < max_id else max_id
120 | for path in range(start_id, end_id):
121 | job_urls.append(f"https://www.lagou.com/jobs/{path}.html")
122 | self.buckets.share.set("use_id", use_id + 2000)
123 |
124 | gongsi_urls = [self.response.css.one("#job_company > dt > a").attrs.get("href", None)]
125 | return {
126 | 0: gongsi_urls,
127 | 1: job_urls
128 | }
129 |
130 | def on_save(self):
131 | """保存数据"""
132 | self.logger.info(f"抓到职位信息 {self.result}")
133 |
134 |
135 | class GongSiDetails(LaGouJobTask):
136 | """公司页面详情信息"""
137 | filters = "https://www.lagou.com/gongsi/(\d+).html"
138 |
139 | def on_parse(self):
140 | result = {
141 | "_id": re.search("(\d+)", self.url).group(0),
142 | "company_abbr": self.response.css.one(".hovertips").text(),
143 | "company_full_name": self.response.css.one(".hovertips").attrs["title"],
144 | "type": self.response.css.one(".type + span").text(),
145 | "process": self.response.css.one(".process + span").text(),
146 | "number": self.response.css.one(".number + span").text(),
147 | "address": self.response.css.one(".address + span").default(None).text(),
148 | "label": [label.text() for label in self.response.css.many(".con_ul_li")],
149 | "website": self.response.css.one(".hovertips").attrs.get("href", None)
150 | }
151 | return result
152 |
153 | def on_save(self):
154 | self.logger.info(f"抓到公司信息 {self.result}")
155 |
156 |
157 | class JobsList(LaGouJobTask):
158 | """工作列表,用于增量拉取职位信息"""
159 | filters = "https://www.lagou.com/jobs/positionAjax.json?(\w+)"
160 |
161 | @tasks.retry(-1, 0)
162 | def on_download(self):
163 | try:
164 | response = self.client.get(
165 | url=self.url,
166 | allow_redirects=False,
167 | headers={
168 | "User-Agent": self.ua.chrome,
169 | "DNT": "1",
170 | "Host": "www.lagou.com",
171 | "Origin": "https://www.lagou.com",
172 | "Referer": "https://www.lagou.com/jobs/list_",
173 | "X-Anit-Forge-Code": "0",
174 | "X-Anit-Forge-Token": None,
175 | "X-Requested-With": "XMLHttpRequest"
176 | }
177 | )
178 | except (errors.ProxyError, errors.RequestError):
179 | raise errors.RetryError
180 |
181 | if response.json['success'] is False:
182 | self.logger.error(f"列表页发现最新URL出现错误,速率过快")
183 | raise errors.TaskBreak
184 | else:
185 | return response
186 |
187 | def on_parse(self):
188 | """提取信息"""
189 | old_max_id = self.buckets.share.get("max_id") or 0
190 | new_max_id = self.response.json['content']['positionResult']['result'][0]['positionId']
191 | if old_max_id < new_max_id:
192 | self.buckets.share.set("max_id", new_max_id)
193 | self.queue.interval = 0.01
194 | return [f"https://www.lagou.com/jobs/{new_max_id}.html"]
195 | else:
196 | return []
197 |
198 | def on_link(self):
199 | """提取链接"""
200 | return {
201 | 1: self.result,
202 | 2: [f"https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false&T={time.time()}"]
203 | }
204 |
--------------------------------------------------------------------------------
/spiders/PinDuoDuoWEB/README.md:
--------------------------------------------------------------------------------
1 | ## 拼多多爬虫网页版
2 |
3 | #### 分类商品列表(ListTask)
4 | 搜索栏分类商品列表
5 |
6 | | 字段 | 示例 | 说明 |
7 | | ------------------- | ------------------------------------------------------------ | ------------ |
8 | | result.thumb_url | https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg | 商品图片链接 |
9 | | result.country | | 国度 |
10 | | result.goods_name | vivo原装耳机x21 x20... | 商品名称 |
11 | | result.short_name | vivo原装耳机x21 x20... | 商品简称 |
12 | | result.sales_tip | 已拼1490件 | 商品销售提示 |
13 | | result.goods_id | 6636323997 | 商品id |
14 | | result.cnt | 545 | 已售数量 |
15 | | result.normal_price | 2500 | 商品售价 |
16 | | result.market_price | 9900 | 商品标价 |
17 | | result.price | 1280 | 商品拼团价 |
18 | | result.link_url     | goods.html?goods_id=6636323997&gallery_id=103375816423      | 商品详情链接 |
19 | | result.mall_name | | 商品店铺名称 |
20 | | result.tag | ['极速退款'] | 商品标签 |
21 | | result.updated_at | 2018-09-02 13:58:08.176553 | 爬取时间 |
22 |
23 | ```python
24 | self.result = [
25 | {
26 | 'thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg',
27 | 'country': '',
28 | 'goods_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话',
29 | 'short_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话',
30 | 'sales_tip': '已拼1490件',
31 | 'cnt': 1490,
32 | 'goods_id': 6636323997,
33 | 'hd_thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg',
34 | 'hd_url': '',
35 | 'normal_price': 1480,
36 | 'market_price': 9900,
37 | 'price': 1280,
38 | 'link_url': 'goods.html?goods_id=6636323997&gallery_id=103375816423',
39 | 'mall_name': None,
40 | 'tag': ['极速退款'],
41 | 'updated_at': '2019-04-15 23:05:57.603136'
42 | },
43 | ...
44 | ]
45 | ```
46 |
47 | ### 商品详情(GoodsTask)
48 |
49 | ```python
50 | self.result = {
51 | 'goods': {
52 | 'serverTime': 1555340763,
53 | 'serverTimeTen': 15553407630,
54 | 'allowedRegions': '2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,22,23,24,25,26,27,30,31,32',
55 | 'catID': 5794,
56 | 'country': '',
57 | 'warehouse': '',
58 | 'goodsDesc': '如果你还再为佩戴迷你型双耳容...',
59 | 'goodsID': 2058994703,
60 | 'goodsName': '防水超小无线蓝牙耳机双耳5.0跑步运动一对迷你vivo入耳oppo耳机',
61 | 'shareDesc': '如果你还再为佩戴迷你型双耳容...',
62 | 'goodsType': 1,
63 | 'localGroups': [],
64 | 'hasLocalGroup': 1,
65 | 'bannerHeight': 375,
66 | 'topGallery': [
67 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/strip%7CimageView2/2/w/1300/q/80',
68 | ...
69 | ],
70 | 'viewImageData': [
71 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/quality/70',
72 | ...
73 | ],
74 | 'detailGallery': [
75 | {'url': '//t00img.yangkeduo.com/t09img/images/2018-07-03/84f1bc3741b3182df6f9d4dae633c9ec.jpeg?imageMogr2/quality/70', 'width': 790, 'height': 790},
76 | ...
77 | ],
78 | 'videoGallery': [],
79 | 'hasLiveGallery': False,
80 | 'descVideoGallery': [],
81 | 'mallID': 17984,
82 | 'groupTypes': [
83 | {'requireNum': '1', 'price': '0', 'totalPrice': '0', 'groupID': 2960548556, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999},
84 | {'requireNum': '2', 'price': '0', 'totalPrice': '0', 'groupID': 2960548557, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999}
85 | ],
86 | 'skus': [
87 | {
88 | 'skuID': 38010790017,
89 | 'quantity': 158,
90 | 'initQuantity': 0,
91 | 'isOnSale': 1,
92 | 'soldQuantity': 0,
93 | 'specs': [
94 | {'spec_key': '颜色', 'spec_value': '黑色-支持双耳通话', 'spec_key_id': 1215, 'spec_value_id': 843019793}
95 | ],
96 | 'thumbUrl': '//t00img.yangkeduo.com/t09img/images/2018-07-03/047554b0d2cd49183b5b2ed2380c528a.jpeg',
97 | 'limitQuantity': 999999,
98 | 'normalPrice': '218',
99 | 'groupPrice': '162',
100 | 'oldGroupPrice': '198',
101 | 'skuExpansionPrice': '0',
102 | 'unselectGroupPrice': '0'
103 | },
104 | ...
105 | ],
106 | 'thumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/a5d77844f14e6438fe196b0d08fd9c63.jpeg',
107 | 'hdThumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/7869f190bfd614a9d3151316f02642a1.jpeg',
108 | 'eventType': 0,
109 | 'isOnSale': True,
110 | 'isGoodsOnSale': True,
111 | 'isSkuOnSale': True,
112 | 'freeCoupon': [],
113 | 'isApp': 0,
114 | 'isFreshmanApp': 0,
115 | 'sideSalesTip': '已拼47件',
116 | 'bottomSalesTip': '',
117 | 'hasAddress': False,
118 | 'catID1': 5752,
119 | 'catID2': 5793,
120 | 'catID3': 5794,
121 | 'catID4': 0,
122 | 'eventComing': False,
123 | 'isMutiGroup': False,
124 | 'isNewUserGroup': False,
125 | 'isSpike': False,
126 | 'isTodaySpike': False,
127 | 'isTomorrowSpike': False,
128 | 'activity': {
129 | 'activityID': 11464199,
130 | 'activityType': 101,
131 | 'startTime': 1554825600,
132 | 'endTime': 1555775999
133 | },
134 | 'isGroupFree': False,
135 | 'isSpikeComing': False,
136 | 'overseaType': 0,
137 | 'isHaitao': False,
138 | 'isAppNewerJoinGroup': False,
139 | 'countryLogo': '',
140 | 'gpv': None,
141 | 'quickRefund': False,
142 | 'rv': True,
143 | 'maxNormalPrice': '218',
144 | 'minNormalPrice': '218',
145 | 'maxGroupPrice': '162',
146 | 'minGroupPrice': '162',
147 | 'maxOnSaleGroupPrice': '162',
148 | 'minOnSaleGroupPrice': '162',
149 | 'maxOnSaleGroupPriceInCent': 16200,
150 | 'minOnSaleGroupPriceInCent': 16200,
151 | 'maxOnSaleNormalPrice': '218',
152 | 'minOnSaleNormalPrice': '218',
153 | 'minTotalGroupPrice': '324',
154 | 'oldMinOnSaleGroupPriceInCent': 19800,
155 | 'unselectMinGroupPrice': '0',
156 | 'unselectMaxGroupPrice': '0',
157 | 'skipGoodsIDs': ['0'],
158 | 'tag': -1,
159 | 'icon': {'id': 5, 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/21bdb71af69e346fc73098a23e808656.png', 'width': 116, 'height': 45},
160 | 'tagIcon': [],
161 | 'isSecondHand': 0,
162 | 'promotionBanner': {
163 | 'id': 1,
164 | 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg',
165 | 'default_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg',
166 | 'new_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg',
167 | 'url_v2': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg',
168 | 'url_v2_h': 96,
169 | 'url_v2_w': 750,
170 | 'serverTime': 1555340763
171 | },
172 | 'isMallDsr': 1,
173 | 'hasPromotion': 1,
174 | 'appClientOnly': 0,
175 | 'isColdGoods': 1,
176 | 'singleCardStatus': 0,
177 | 'singleCardCount': 0,
178 | 'goodsProperty': [
179 | {'key': '佩戴方式', 'values': ['入耳式']},
180 | ...
181 | ],
182 | ...
183 |     },
184 |     'mall': {...},
185 |     'reviews': {...}
186 | }
187 | ```
188 | 
189 | get_anticontent.js 来自 https://github.com/SergioJune/Spider-Crack-JS/blob/master/pinduoduo/get_anticontent.js
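190 | 
191 | 若需单独调试anti_content的生成,可参照tasks.py中get_anti_content的写法,用execjs加载上述JS后调用其get_anti函数
192 | (下例中的UA字符串仅为示意):
193 | 
194 | ```python
195 | import execjs
196 | 
197 | # 读取并编译get_anticontent.js(与tasks.py中的做法一致)
198 | with open("get_anticontent.js", "r", encoding="utf-8") as f:
199 |     ctx = execjs.compile(f.read())
200 | 
201 | ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/73.0 Safari/537.36"
202 | anti_content = ctx.call("get_anti", "https://mobile.yangkeduo.com/catgoods.html", ua)
203 | print(anti_content)  # 该值作为anti_content参数拼接到列表页接口URL上
204 | ```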
--------------------------------------------------------------------------------
/spiders/WeiBo/tasks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import datetime
3 | import itertools
4 | from pyloom import tasks
5 | from pyloom.errors import *
6 |
7 |
8 | class PWATask(tasks.Task):
9 | _redis = None
10 |
11 | def __init__(self, *args, **kwargs):
12 | super(PWATask, self).__init__(*args, **kwargs)
13 | self.uid = self.url.split(":")[1]
14 | self.client.headers = {
15 | 'Accept': 'application/json, text/plain, */*',
16 | 'Referer': f'https://m.weibo.cn/profile/{self.uid}',
17 | 'MWeibo-Pwa': '1',
18 | 'X-Requested-With': 'XMLHttpRequest',
19 | 'User-Agent': self.ua.chrome
20 | }
21 |
22 | @tasks.retry(tries=16, delay=0, backoff=0)
23 | def download(self, url, params):
24 | """下载并判断是否被封禁"""
25 | try:
26 | resp = self.client.get(url, params, timeout=8)
27 | except (ProxyError, RequestError) as e:
28 | if self.download.count >= 3: # 重试两次后更换代理
29 | self.logger.debug("请求错误", e)
30 | self.client.reload_proxy()
31 | raise RetryError
32 | try:
33 | errno = resp.json.get('errno', None)
34 | except JSONDecodeError:
35 | errno = None
36 | if resp.status_code == 418 or (resp.status_code == 403 and errno == "100005"):
37 | self.logger.debug("响应包错误,IP已被封禁")
38 | self.client.reuse_proxy(150)
39 | self.client.reload_proxy()
40 | raise RetryError
41 | elif errno == "20003":
42 | self.logger.debug("响应包错误,用户不存在", self.uid)
43 | raise TaskFinish()
44 | elif resp.status_code != 200:
45 | self.logger.debug(f"响应包错误,状态码:{resp.status_code}", self.uid)
46 | if self.download.count >= 3:
47 | self.client.reload_proxy()
48 | raise RetryError
49 | elif errno is not None:
50 | msg = resp.json['msg']
51 | self.logger.debug(f"响应包错误,errno={errno},msg={msg}", self.uid)
52 | raise TaskError("msg:" + msg)
53 | else:
54 | self.client.reuse_proxy(0)
55 | return resp
56 |
57 |
58 | class UserTask(PWATask):
59 | """用户资料"""
60 | filters = "user:\w+"
61 |
62 | def on_download(self):
63 | return self.download('https://m.weibo.cn/profile/info', {'uid': self.uid})
64 |
65 | def parse_text(self, _text):
66 | """转换微博内容以节约空间"""
67 |
68 | # 将表情包转为:[拜拜]
69 | def replacer_first(match):
70 | return match.groups()[0]
71 |
72 |         text = re.sub(
73 |             r'<span[^>]+class="url-icon">\s*<img[^>]+alt="?(\[[^\]]+\])"?[^>]*>\s*</span>', replacer_first, _text
74 |         )
75 | 
76 |         # 将链接转为:{title}(url)
77 |         # title一般为话题
78 |         def replacer_link(match):
79 |             groups = match.groups()
80 |             return f"{{{groups[1]}}}({groups[0]})"
81 | 
82 |         text = re.sub(
83 |             r'<a[^>]+href="([^"]+)".*?>\s*([^<>]+)\s*</a>',
84 |             replacer_link, text
85 |         )
86 |         # 将@链接转为:@XXX
87 |         text = re.sub(r'<a[^>]+>(@[^<>]+)</a>', replacer_first, text)
88 |         # 将<br/>转为\n
89 |         text = text.replace("<br/>", "\n")
90 | return text
91 |
92 | def parse_status(self, _status):
93 | """递归提取源微博与被转发微博"""
94 | status = {
95 | 'mid': int(_status['id']),
96 | 'uid': _status['user']['id'],
97 | 'text': self.parse_text(_status['text']),
98 | 'reposts_count': _status['reposts_count'],
99 | 'comments_count': _status['comments_count'],
100 | 'attitudes_count': _status['attitudes_count'],
101 | 'source': _status['source'],
102 | 'updated_at': datetime.datetime.now(),
103 | 'created_at': datetime.datetime.strptime(
104 | _status['created_at'], "%a %b %d %H:%M:%S %z %Y"),
105 | }
106 | retweeted_status = _status.get('retweeted_status', None)
107 | if retweeted_status: # 转发
108 | status['pmid'] = _status.get('pid', 0)
109 | status['smid'] = int(retweeted_status['id'])
110 | status['suid'] = int(retweeted_status['user']['id'])
111 | repost = status
112 | status, _ = self.parse_status(retweeted_status)
113 | return status, repost
114 | else: # 原创
115 | status['images'] = _status['pic_ids']
116 | status['is_long_text'] = _status['isLongText']
117 | return status, None
118 |
119 | def on_parse(self):
120 | # 用户信息
121 | _user = self.response.json['data']['user']
122 | user = {
123 | 'uid': _user['id'],
124 | 'screen_name': _user['screen_name'],
125 | 'statuses_count': _user['statuses_count'],
126 | 'verified_type': _user['verified_type'],
127 | 'verified_type_ext': _user.get('verified_type_ext', -1),
128 | 'description': _user['description'],
129 | 'gender': _user['gender'],
130 | 'mbtype': _user['mbtype'],
131 | 'urank': _user['urank'],
132 | 'mbrank': _user['mbrank'],
133 | 'followers_count': _user['followers_count'],
134 | 'follow_count': _user['follow_count'],
135 | 'profile_image_id': _user['profile_image_url'].rsplit("/", 1)[1].split(".")[0],
136 | 'status': 0,
137 | 'updated_at': datetime.datetime.now()
138 | }
139 | # 最近微博
140 | statuses = []
141 | reposts = []
142 | for _status in self.response.json['data']['statuses']:
143 | status, repost = self.parse_status(_status)
144 | if status:
145 | statuses.append(status)
146 | if repost:
147 | reposts.append(repost)
148 |
149 | return user, statuses, reposts
150 |
151 | def on_link(self):
152 | return {
153 | 3: [f'follow:{self.uid}'],
154 | 4: [f'fan:{self.uid}']
155 | }
156 |
157 | def on_save(self):
158 | self.logger.info("抓到用户信息", self.result[0])
159 | if self.result[1]:
160 | self.logger.info("抓到原创微博", self.result[1])
161 | if self.result[2]:
162 | self.logger.info("抓到转发微博", self.result[2])
163 |
164 |
165 | class ContainerTask(PWATask):
166 | """解析关注和粉丝列表的响应包"""
167 |
168 | def on_parse(self):
169 | targets = []
170 | for page in self.response:
171 | cards = page.json['data']['cards']
172 | for card in cards:
173 | style = card.get('card_style', None)
174 | group = card['card_group']
175 | if style is None: # 普通用户
176 | targets.extend(g['user']['id'] for g in group)
177 | elif style == 1: # 推荐用户
178 | if len(group) == 3 and 'scheme' in group[2]: # 相关大V用户
179 | if 'users' in group[1]:
180 | ids = [user['id'] for user in group[1]['users']]
181 | elif 'user' in group[1]:
182 | ids = [group[1]['user']['id']]
183 | else:
184 | ids = []
185 | else: # 大V用户
186 | ids = [g['user']['id'] for g in group if 'user' in g]
187 | targets.extend(ids)
188 | else:
189 | raise TaskError(f"card_style={style}")
190 | _targets = []
191 | for t in targets:
192 | try:
193 | _targets.append(int(t))
194 | except ValueError:
195 | pass
196 | return _targets
197 |
198 | def on_link(self):
199 | return {1: [f"user:{uid}" for uid in self.result]} if self.result else {}
200 |
201 |
202 | class FollowerTask(ContainerTask):
203 | """关注列表"""
204 | filters = "follow:\w+"
205 |
206 | def on_download(self):
207 | pages = []
208 | url = "https://m.weibo.cn/api/container/getIndex"
209 | for page_id in itertools.count(1):
210 | params = {"containerid": f"231051_-_followers_-_{self.uid}"}
211 | if page_id != 1:
212 | params['page'] = page_id
213 | resp = self.download(url, params)
214 | if resp.json['ok'] == 0: # 已到最后一页
215 | break
216 | pages.append(resp)
217 | return pages
218 |
219 | def on_save(self):
220 | self.logger.info("抓到关注列表", self.result)
221 |
222 |
223 | class FanTask(ContainerTask):
224 | """粉丝列表"""
225 | filters = "fan:\w+"
226 |
227 | def on_download(self):
228 | pages = []
229 | url = "https://m.weibo.cn/api/container/getIndex"
230 | for since_id in itertools.count(1):
231 | params = {"containerid": f"231051_-_fans_-_{self.uid}"}
232 | if since_id != 1:
233 | params['since_id'] = since_id
234 | resp = self.download(url, params)
235 | if resp.json['ok'] == 0: # 已到最后一页
236 | break
237 | pages.append(resp)
238 | return pages
239 |
240 | def on_save(self):
241 | self.logger.info("抓到粉丝列表", self.result)
242 |
--------------------------------------------------------------------------------
/pyloom/worker.py:
--------------------------------------------------------------------------------
1 | import redis
2 | import signal
3 | import traceback
4 | import threading
5 | import multiprocessing
6 | from . import buckets
7 | from .utils import *
8 | from .tasks import Task, execute
9 | from .scheduler import Spider, Queue
10 |
11 | logger = logging.getLogger("worker")
12 |
13 |
14 | def worker_process(redis_conf, spiders, threads, token_curr, token_new):
15 | """
16 | Worker子进程,负责启动线程
17 | Args:
18 | redis_conf: redis数据库
19 | spiders: 所有爬虫配置表,{name: (path, version)}
20 | threads: 线程数
21 | token_curr: 新建进程时的token
22 | token_new: 父进程中最新的token
23 | 当token_curr与token_new不同时,表示父进程已更新了路由,
24 | 线程在完成当前生命周期后需自行退出
25 | """
26 | logger.debug("Worker进程已启动")
27 | # Manager的共享变量在并发启动过多进程时会出现ConnectionRefusedError
28 | for _ in range(60):
29 | try:
30 | spiders.items()
31 | break
32 | except Exception:
33 | pass
34 | else:
35 | logger.fatal("Worker进程退出,spiders超时未就绪")
36 | return
37 |
38 | thread_ids = []
39 | # 构造路由,{name: [[regex, task]...]}
40 | routers = {}
41 | for name, (path, version) in spiders.items():
42 | tasks = import_tasks(path)
43 | if tasks:
44 | routers[name] = tasks
45 | logger.info("载入爬虫成功", name, version)
46 | else:
47 | logger.info("载入爬虫失败,未发现合规Task类", name, version)
48 | # 启动线程
49 | try:
50 | logger.info("正在启动Worker线程")
51 | signal.signal(signal.SIGINT, signal.SIG_IGN) # 忽略Ctrl+C
52 | for thread_index in range(threads):
53 | thread = threading.Thread(
54 | target=worker_thread,
55 | args=(redis_conf, routers, token_curr, token_new, thread_index)
56 | )
57 | thread.start()
58 | thread_ids.append(thread)
59 | logger.info("Worker线程启动成功")
60 | except Exception as e:
61 | logger.fatal("Worker进程结束,启动Worker线程时出现异常", e, '\n', traceback.format_exc())
62 | return
63 |
64 | for i in itertools.count():
65 | try:
66 | # 清理进程内的过期键
67 | if i % 500 == 0:
68 | count = buckets.LocalBucket.purge()
69 | if count:
70 | logger.debug(f"完成清理LocalBucket", count)
71 | # 线程全部退出后,结束进程
72 | if not any([t.is_alive() for t in thread_ids]):
73 | logger.info("Worker进程结束,线程已全部退出")
74 | return
75 | time.sleep(2)
76 | except Exception as e:
77 | logger.fatal("Worker进程异常", e, '\n', traceback.format_exc())
78 | time.sleep(5)
79 |
80 |
81 | def worker_thread(redis_conf, routers, token_curr, token_new, thread_index):
82 | """
83 | 循环:申请任务->执行任务->上报结果
84 | 线程内捕捉所有异常,永不退出(Ctrl+C除外)
85 | """
86 | logger.debug("Worker线程已启动")
87 | db = redis.StrictRedis.from_url(redis_conf)
88 | pop_failure_count = 0
89 | while True:
90 | try:
91 | # 结束线程
92 | try:
93 | if token_curr != token_new.value:
94 | logger.info("Worker线程结束,收到退出信号")
95 | return
96 | except ConnectionRefusedError:
97 | logger.debug("Token未就绪")
98 | time.sleep(1)
99 | continue
100 | except (BrokenPipeError, FileNotFoundError, EOFError):
101 | logger.info("Worker线程结束,Token已关闭")
102 | return
103 | # 从队列中弹出URL
104 | if not routers:
105 | logger.info("本地爬虫列表为空,等待加载爬虫")
106 | while not routers:
107 | time.sleep(1)
108 | keys = list(routers.keys())
109 | url, name, address = Queue.pop(db, keys)
110 | if not url:
111 | if pop_failure_count % 20 == 0: # 避免日志过多
112 | logger.debug("暂无已就绪任务,稍后重试")
113 | time.sleep(thread_index / 10 + 0.1)
114 | pop_failure_count += 1
115 | continue
116 | logger.info("获得任务", name, url, address)
117 | pop_failure_count = 0
118 | # 匹配Task类并执行
119 | tasks = routers.get(name, None)
120 | queue = Queue(db, name)
121 | if tasks is None:
122 | logger.warning("爬虫匹配失败", name, url)
123 | queue.report_error("none_spider", url)
124 | continue
125 | for regex, task_cls in tasks:
126 | if not regex.match(url):
127 | continue
128 | # 实例化Task并执行
129 | task = task_cls(name, url, db, address)
130 | links = execute(task)
131 | for priority, urls in links.items():
132 | count = queue.add(urls, priority)
133 | logger.debug("添加任务", priority, f"{count}/{len(urls)}")
134 | logger.debug("报告任务完成", queue.report_finish(url), url)
135 | break
136 | else:
137 | logger.warning("任务匹配失败", name, url)
138 | queue.report_error("none_task", url)
139 | except Exception as e:
140 | logger.error("Worker线程异常", e, '\n', traceback.format_exc())
141 | time.sleep(5)
142 |
143 |
144 | def import_tasks(path):
145 | """
146 | 扫描并导入爬虫模块中的Tasks
147 | Return:
148 | [[regex, task]...]
149 | """
150 | tasks = []
151 | # 导入模块
152 | parent = os.path.dirname(path)
153 | if parent not in sys.path:
154 | sys.path.append(parent)
155 | basename = os.path.basename(path)
156 | try:
157 | logger.debug("加载爬虫模块", basename)
158 | _module = importlib.import_module(basename)
159 | except Exception as e:
160 | logger.error("加载爬虫模块异常", e, '\n', traceback.format_exc())
161 | return []
162 | # 扫描模块中合规的Task子类
163 | # 何为合规?
164 | # 1.Task的子类; 2.filters成员; 3.导入无异常; 4.名称不以'__'开头
165 | for name in dir(_module):
166 | if name.startswith("__"):
167 | continue
168 | var = getattr(_module, name)
169 | try:
170 | is_subclass = issubclass(var, Task)
171 | except TypeError:
172 | continue
173 | try:
174 | if is_subclass:
175 | if hasattr(var, 'filters') and isinstance(var.filters, (list, tuple, str)):
176 | if isinstance(var.filters, str):
177 | filters = [var.filters]
178 | else:
179 | filters = var.filters
180 | for regex in filters:
181 | tasks.append([re.compile(regex), var])
182 | logger.info("导入Task类", var.__name__)
183 | else:
184 | logger.warning("忽略Task类", var.__name__, "filters不合规")
185 | continue
186 | else:
187 | continue
188 | except Exception as e:
189 | logger.error("加载Task类异常", e, '\n', traceback.format_exc())
190 | continue
191 | return tasks
192 |
193 |
194 | def start(spider_path, redis_conf, spider_configs, proxies, processes, threads):
195 | """
196 | 重置爬虫状态后运行指定爬虫
197 | Args:
198 | spider_path: 爬虫目录
199 | redis_conf: Redis配置
200 | spider_configs: 爬虫配置
201 | proxies: 使用代理运行
202 | processes: 进程数量
203 | threads: 每个进程的线程数量
204 | """
205 | logger.info("正在启动爬虫")
206 | db = redis.StrictRedis.from_url(redis_conf)
207 | name = os.path.basename(spider_path) # 取目录名为爬虫名
208 | RedisScripts.load(db)
209 | spider = Spider(db, name)
210 | # 注册爬虫/更新同名爬虫配置
211 | logger.info("注册爬虫", name)
212 | logger.info("爬虫配置", spider_configs)
213 | spider.upsert(spider_configs['seeders'], spider_configs['interval'],
214 | spider_configs['timeout'], spider_configs['precision'],
215 | spider_configs['args'], proxies, time.time())
216 | # 重置爬虫状态
217 | status = spider.get_field("status")
218 | if status != 10:
219 | spider.set_field("status", 10)
220 |         logger.info("重置爬虫状态", f"{status} -> 10")
221 | # 回滚'timeout'异常队列
222 | queue = Queue(db, name)
223 | logger.debug("清理Redis")
224 | Queue.purge(db)
225 | logger.info("回滚超时任务")
226 | queue.rollback_tag("timeout", 0)
227 | # 启动Worker
228 | logger.info("正在启动Worker")
229 | spiders = multiprocessing.Manager().dict({name: [spider_path, 0]})
230 | pool = []
231 | token = multiprocessing.Manager().Value('d', 0)
232 | for _ in range(processes):
233 | p = multiprocessing.Process(
234 | target=worker_process,
235 | args=(redis_conf, spiders, threads, token.value, token)
236 | )
237 | p.start()
238 | pool.append(p)
239 | logger.info("Worker启动成功")
240 | try:
241 | # 循环检查爬虫状态,当爬虫停止时终止运行
242 | while True:
243 | time.sleep(0.2)
244 | spider = Spider(db, name)
245 | status = spider.get_field("status")
246 | if status < 10:
247 | logger.info("爬虫停止,当前状态为:", Spider.status.get(status, "未知"))
248 | break
249 | except KeyboardInterrupt:
250 | logger.info("收到Ctrl+C", 'main')
251 | for p in pool:
252 | p.terminate()
253 | logger.info("爬虫停止", "Ctrl+C")
254 | except Exception as e:
255 | logger.error("爬虫停止", "未知异常", e, '\n', traceback.format_exc())
256 |
257 |
258 | def start_all(redis_conf, spiders_path, processes, threads):
259 | """
260 | 启动所有爬虫
261 | Args:
262 | redis_conf: Redis配置
263 | spiders_path: 放置所有爬虫的目录
264 | processes: 进程数量
265 | threads: 每个进程的线程数量
266 | """
267 |
--------------------------------------------------------------------------------
/pyloom/utils.py:
--------------------------------------------------------------------------------
1 | """小工具"""
2 | import re
3 | import os
4 | import sys
5 | import time
6 | import uuid
7 | import types
8 | import logging
9 | import readline
10 | import functools
11 | import itertools
12 | import importlib
13 | from pyloom.errors import *
14 | from importlib.machinery import SourceFileLoader
15 | from logging.handlers import TimedRotatingFileHandler
16 |
17 | logger = logging.getLogger("utils")
18 |
19 |
20 | class ArgDefault(object):
21 | """默认参数"""
22 |
23 | def __bool__(self):
24 | return False
25 |
26 |
27 | def patch_logger_format():
28 | """使logger支持用多个参数构造日志内容"""
29 | log_bak = logging.Logger._log
30 |
31 | def log(self, level, msg, *args):
32 | gap = ' '
33 | out = str(msg) + gap
34 | for value in args[0]:
35 | out = out + str(value) + gap
36 | log_bak(self, level, out, [])
37 |
38 | logging.Logger._log = log
39 |
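# After this patch, log calls may pass extra positional values which are joined
# with spaces, e.g. logger.info("载入爬虫成功", name, version) renders roughly as
# "载入爬虫成功 <name> <version>". (Illustrative note; this is the call style used
# throughout worker.py.)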
40 |
41 | def patch_handler_color(handler):
42 | """使handler支持根据日志级别输出彩色日志"""
43 | emit_bak = handler.emit
44 |
45 | def emit(*args):
46 | level = args[0].levelno
47 | if level >= 50:
48 | color = '\x1b[31m' # red, critical
49 | elif level >= 40:
50 | color = '\x1b[31m' # red, error
51 | elif level >= 30:
52 | color = '\x1b[33m' # yellow, warning
53 | elif level >= 20:
54 | color = '\x1b[32m' # green, info
55 | elif level >= 10:
56 | color = '\x1b[35m' # pink, debug
57 | else:
58 | color = '\x1b[0m' # normal
59 | args[0].msg = color + args[0].msg + '\x1b[0m'
60 | return emit_bak(*args)
61 |
62 | handler.emit = emit
63 |
64 |
65 | class RedisScripts(object):
66 | """管理redis-lua脚本"""
67 | _scripts = {}
68 |
69 | @classmethod
70 | def load(cls, db):
71 | path = os.path.join(os.path.dirname(__file__), 'lua')
72 | for filename in os.listdir(path):
73 | lua_file = os.path.join(path, filename)
74 | with open(lua_file, encoding="utf-8") as f:
75 | sha1 = db.script_load(f.read())
76 | command = filename.split('.')[0]
77 | RedisScripts._scripts[command] = sha1
78 | logger.info("缓存Lua脚本", command, sha1)
79 |
80 | @classmethod
81 | def sha1(cls, command):
82 | return RedisScripts._scripts[command]
83 |
84 |
85 | def dict_merge(base: dict, delta: dict, check_not_none=True) -> dict:
86 | """
87 | 将delta递归合并至base,覆盖同名字段
88 |     若check_not_none为True,
89 | 合并后不应有值为ConfigNotNone,否则抛出ConfigNotNone异常
90 | Example:
91 | # 递归合并,修改实参
92 | >>> base = {'redis': {'host': '127.0.0.1', 'port': 6379}}
93 | >>> delta = {'redis': {'host': '192.168.1.1'}}
94 | >>> dict_merge(base, delta)
95 | {'redis': {'host': '192.168.1.1', 'port': 6379}}
96 | >>> base
97 | {'redis': {'host': '192.168.1.1', 'port': 6379}}
98 |
99 | # 参数check_not_none
100 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}}
101 | >>> delta = {'redis': {'host': '192.168.1.1'}}
102 | >>> dict_merge(base, delta, True)
103 | Traceback (most recent call last):
104 | ...
105 | pyloom.errors.ConfigNotNone: 缺少配置项:'port'
106 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}}
107 | >>> dict_merge(base, delta, False)
108 |     {'redis': {'host': '192.168.1.1', 'port': <class 'pyloom.errors.ConfigNotNone'>}}
109 | """
110 | if not isinstance(base, dict):
111 | return delta
112 | common_keys = set(base).intersection(delta)
113 | new_keys = set(delta).difference(common_keys)
114 | for key in common_keys:
115 | base[key] = dict_merge(base[key], delta[key], check_not_none)
116 | for key in new_keys:
117 | base[key] = delta[key]
118 | if check_not_none:
119 | for key in base:
120 | if base[key] is ConfigNotNone:
121 | raise ConfigNotNone(key)
122 | return base
123 |
124 |
125 | def retry(tries=-1, delay=1, max_delay=None, backoff=0, catches=None, error=None):
126 | """
127 | 自动重试
128 |
129 | 当delay=1,backoff=0时,依此休眠:
130 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
131 | 当delay=1,backoff=1时,依此休眠:
132 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
133 | 当delay=1,backoff=2时,依此休眠:
134 | [1, 2, 5, 10, 17, 26, 37, 50, 65, 82]
135 |
136 | Args:
137 | tries: 重试次数,(-1:不限重试次数)
138 | delay: 初始重试秒数
139 | max_delay: 最大重试秒数(None:不限)
140 | backoff: 退避指数
141 | catches: 可被捕捉的异常(RetryError始终可用)
142 | error: 达到最大重试次数时抛出的异常(默认RetryExceeded)
143 | """
144 | if catches is None:
145 | catches = []
146 |
147 | def decorator(func):
148 | @functools.wraps(func)
149 | def wrapper(*args, **kwargs):
150 | # 记数生成器
151 | if tries >= 0:
152 | count = range(tries)
153 | else:
154 | count = itertools.count()
155 | # 处理重试
156 | for i in count:
157 | setattr(wrapper, "count", i)
158 | try:
159 | return func(*args, **kwargs)
160 | except (RetryError, *catches):
161 | if backoff == 0:
162 | sleep = delay
163 | else:
164 | sleep = delay + i ** backoff
165 | if max_delay:
166 | sleep = min(sleep, max_delay)
167 | time.sleep(sleep)
168 | # 重试次数超限
169 | else:
170 | if error is None:
171 | raise RetryExceeded
172 | else:
173 | raise error
174 |
175 | return wrapper
176 |
177 | return decorator
178 |
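# Usage sketch (illustrative only; `fetch_once` and the use of ConnectionError are
# hypothetical, not part of pyloom): retry a flaky call up to 5 times, sleeping
# 1, 2, 5, 10 seconds between attempts (delay + i ** backoff).
#
# @retry(tries=5, delay=1, backoff=2, catches=[ConnectionError])
# def fetch_once(url):
#     ...  # raise RetryError (or ConnectionError) to trigger another attempt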
179 |
180 | def template_input(template):
181 | """
182 | 在命令行提示用户输入配置
183 | Args:
184 | template: 配置模板
185 | 例如完成员工信息填写:
186 | ArgDefault表示必填参数,无默认值
187 | [
188 | {
189 | "name": 配置名,
190 | "title": 配置标题,
191 | "example": 示例,
192 | "default": 默认值(留空表示必填参数),
193 | "note": 提示信息,
194 | "type": 类型转换函数
195 | }
196 | ]
197 | """
198 | configs = {}
199 | for fields in template:
200 | name = fields['name']
201 | default = fields.get('default', ArgDefault)
202 | example = fields.get('example', ArgDefault)
203 | note = fields.get('note', ArgDefault)
204 | title = fields.get('title', name)
205 | regex = fields.get('regex', ArgDefault)
206 | _type = fields.get('type', ArgDefault)
207 | _range = fields.get('range', ArgDefault)
208 | if _type is not ArgDefault:
209 | output = f"{title}[{_type.__name__}]\n"
210 | else:
211 | output = f"{title}\n"
212 | if example is not ArgDefault:
213 | output += f"示例: {example}\n"
214 | if note is not ArgDefault:
215 | output += f"提示: {note}\n"
216 | output += '➜ '
217 | first = True
218 | while True:
219 | if first:
220 | var = input(output)
221 | first = False
222 | else:
223 | var = input('➜ ')
224 | if var:
225 | # 类型检查
226 | if _type is not ArgDefault:
227 | try:
228 | var = _type(var)
229 | except ValueError:
230 | print(f"参数类型有误,请重试")
231 | continue
232 | # 范围检查
233 | if _range is not ArgDefault and var not in _range:
234 | print(f"参数范围有误,请重试")
235 | continue
236 | # 正则检查
237 | if regex is not ArgDefault and not re.match(regex, var):
238 | print(f"参数格式有误,请重试")
239 | continue
240 | break
241 | elif not var and default is not ArgDefault:
242 | var = default
243 | break
244 | else:
245 | print(f"参数不可留空,请重试")
246 | configs[name] = var
247 | return configs
248 |
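# Illustrative call (hypothetical field values), following the schema documented above:
#
# configs = template_input([{
#     "name": "port",
#     "title": "listening port",
#     "example": 8080,
#     "default": 8080,
#     "type": int,
# }])
# # pressing Enter accepts the default -> {"port": 8080}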
249 |
250 | def load_py_configs(file) -> dict:
251 | """
252 | 加载PY格式的配置文件,当配置文件为空时,返回{}
253 | Args:
254 | file: 配置文件路径
255 | """
256 | if not os.path.exists(file):
257 | raise ConfigFileNotFoundError(file)
258 | m = SourceFileLoader(uuid.uuid4().hex, file).load_module()
259 | return {k: v for k, v in vars(m).items() if not k.startswith('__')}
260 |
261 |
262 | def load_spider_configs(path) -> dict:
263 | """
264 | 加载爬虫配置
265 | Args:
266 | path: 爬虫目录
267 | """
268 | _configs = {
269 | "seeders": ConfigNotNone,
270 | "interval": 3,
271 | "timeout": 120,
272 | "precision": 0.0001,
273 | "args": {}
274 | }
275 | conf = os.path.join(path, 'configs.py')
276 | if not os.path.exists(conf):
277 |         raise ConfigFileNotFoundError(f"未找到爬虫配置:'{conf}'")
278 | return dict_merge(_configs, load_py_configs(conf))
279 |
280 |
281 | def tail(file):
282 | """模仿linux中的tail命令"""
283 | try:
284 | with open(file, 'rb') as f:
285 | for i in range(1, 11):
286 | try:
287 | f.seek(-(i ** 3), 2)
288 | except OSError:
289 | f.seek(-((i - 1) ** 3), 2)
290 | break
291 | while True:
292 | line = f.readline()
293 | if not line:
294 | time.sleep(0.1)
295 | continue
296 | try:
297 | yield line.decode('utf8')
298 | except UnicodeDecodeError:
299 | time.sleep(0.1)
300 | continue
301 | except KeyboardInterrupt:
302 | pass
303 |
--------------------------------------------------------------------------------
/pyloom/scheduler.py:
--------------------------------------------------------------------------------
1 | """Scheduler SDK"""
2 | import json
3 | import time
4 | import copy
5 | import random
6 | import logging
7 | from .errors import *
8 | from redis import StrictRedis
9 | from . import utils, tasks, buckets
10 |
11 | key_spiders = "spiders" # set
12 | logger = logging.getLogger("scheduler")
13 |
14 |
15 | class Spider(object):
16 | prefix = "spider"
17 | _caches = {}
18 | _timeout = 10
19 | status = {
20 | 0: '已完成',
21 | 10: '就绪',
22 | 20: '等待代理', # 暂未实现
23 | 21: '等待时间', # 暂未实现
24 | -1: '异常关闭',
25 | -2: '主动关闭'
26 | }
27 |
28 | def __init__(self, db: StrictRedis, name):
29 | self.name = name # 爬虫名
30 | self._db = db
31 | self.key = f"{self.prefix}:{self.name}" # 主键
32 | self.fields = { # 爬虫所有字段及可缓存时间
33 | "interval": 300,
34 | "timeout": 300,
35 | "precision": 10000,
36 | "args": 300,
37 | "last_pop_time": 1,
38 | "status": 1,
39 | "version": 1,
40 | "proxies": 1,
41 | "last_heartbeat_time": 6
42 | }
43 |
44 | def exists(self):
45 | """爬虫是否存在"""
46 | return self._db.exists(self.key)
47 |
48 | def upsert(self, seeders, interval, timeout, precision, args, proxies, version):
49 | """
50 | 新建爬虫或覆盖同名爬虫的配置(仅当版本号更大时)
51 | Args:
52 | seeders: 种子页面
53 | interval: 最小调度间隔(误差由pop频率决定)
54 | timeout: 任务超时时间
55 | precision: 布隆过滤器精度
56 | args: 自定义爬虫参数
57 | proxies: 使用代理运行
58 | version: 配置版本,等于目录的sha1值
59 | Returns:
60 | T/F: 是否更新了配置
61 | """
62 | # 当前版本比数据库的还小就不更新了
63 | _version = self._get_field("version")
64 | if _version is not None and version <= _version:
65 | return False
66 | # 忽略更新precision字段
67 | _precision = self._get_field("precision")
68 | if _precision is not None:
69 | precision = _precision
70 | # 爬虫配置
71 | values = {
72 | "interval": interval,
73 | "timeout": timeout,
74 | "precision": precision,
75 | "args": args,
76 | "last_pop_time": 0,
77 | "status": 10, # 0:已完成,10:就绪,20:等待代理,21:等待时间,-1:异常关闭,-2:主动关闭
78 | "version": version,
79 | "proxies": proxies, # 代理配置
80 | "last_heartbeat_time": 0, # 最后一次尝试申请任务的时间
81 | }
82 | self._db.hmset(self.key, {k: json.dumps(v) for k, v in values.items()})
83 | self._db.sadd(key_spiders, self.name)
84 | # 将种子URL入队
85 | queues = Queue(self._db, self.name)
86 | queues.add(seeders, 0)
87 | return True
88 |
89 | def _get_field(self, field):
90 | """从数据库中查询并返回爬虫的配置项"""
91 | if field not in self.fields:
92 | raise SchedulerError(f"没有此配置项'{field}'")
93 |
94 | res = self._db.hget(self.key, field)
95 | if res is None:
96 | return None
97 | else:
98 | return json.loads(res)
99 |
100 | def get_field(self, field):
101 | """依此从缓存、数据库中查询并返回爬虫的配置项"""
102 | if field not in self.fields:
103 | raise SchedulerError(f"没有此配置项'{field}'")
104 |
105 | timeout = self.fields[field]
106 | # 先尝试从缓存取值
107 | cache_key = f"{self.name}:{field}"
108 | var, start = Spider._caches.get(cache_key, (None, 0))
109 | if start + timeout < time.time():
110 | # 缓存过期或无缓存
111 | var = self._get_field(field)
112 | if timeout > 0:
113 | Spider._caches[cache_key] = (var, time.time())
114 | return var
115 |
116 | def set_field(self, field, value):
117 | """覆写爬虫的配置项"""
118 | if field not in self.fields:
119 | raise SchedulerError(f"没有此配置项'{field}'")
120 | if field == 'precision':
121 | raise SchedulerError(f"配置项被锁定'{field}'")
122 |
123 | self._db.hset(self.key, field, json.dumps(value))
124 | # 设置缓存
125 | cache_key = f"{self.name}:{field}"
126 | timeout = self.fields[field]
127 | if timeout > 0:
128 | Spider._caches[cache_key] = (value, time.time())
129 |
130 | @classmethod
131 | def names(cls, db: StrictRedis):
132 | """返回所有爬虫名称的列表"""
133 | return [r.decode() for r in db.smembers(key_spiders)]
134 |
135 | def clear_queue(self):
136 | """清除该爬虫在队列中留存的数据"""
137 | keys = []
138 | keys += self._db.keys(f"{Spider.prefix}:{self.name}")
139 | keys += self._db.keys(f"{Queue.prefix}:{self.name}:*")
140 | keys += self._db.keys(f"{buckets.ShareBucket.prefix}:{self.name}:*")
141 | keys += self._db.keys(f"{tasks.Tracking.prefix}:{self.name}:*")
142 | return self._db.delete(*keys) if keys else 0
143 |
144 | def clear_proxy(self):
145 | """清空该爬虫的代理池"""
146 | count = self._db.delete(f"proxy:addresses:{self.name}")
147 | count += self._db.srem(key_spiders, self.name)
148 | return count
149 |
150 |
151 | class Queue(object):
152 | prefix = "queue"
153 |
154 | def __init__(self, db: StrictRedis, name):
155 | self.name = name # 爬虫名
156 | self._db = db
157 | self._spider = Spider(db, name)
158 | # 等待队列(list),5个优先级分别用5个list实现,左进右出
159 | # [[url0, url1], [url0, url1], [url0, url1]]
160 | self.key_waiting = [f"{self.prefix}:{self.name}:waiting:{i}" for i in range(5)]
161 | # 进行队列(hash),field=url, value=timestamp
162 | self.key_processing = f"{self.prefix}:{self.name}:processing"
163 | # 异常标签(set)
164 | self.key_tags = f"{self.prefix}:{self.name}:tags"
165 | # 异常队列(list)
166 | self.prefix_error = f"{self.prefix}:{self.name}:errors" # :{tag}
167 | # 队列过滤器(set),过滤waiting、processing、errors中的URl
168 | self.key_filter_queue = f"{self.prefix}:{self.name}:filter:queue"
169 | # 结果过滤器(string or set),过滤已抓取完成的URL
170 | # 结果过滤器有两种实现:set、bloom,通过爬虫配置项'queue.filter'选择适合的实现
171 | self.key_filter_bloom_count = f"{self.prefix}:{self.name}:filter:bloom:count"
172 |
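    # Resulting key layout for a spider hypothetically named "demo"
    # (derived directly from the f-strings above):
    #   queue:demo:waiting:0 ... queue:demo:waiting:4   lists, one per priority
    #   queue:demo:processing                           hash, url -> pop timestamp
    #   queue:demo:tags                                 set of error tags
    #   queue:demo:errors:<tag>                         list of failed URLs per tag
    #   queue:demo:filter:queue                         set deduplicating queued URLs
    #   queue:demo:filter:bloom:count                   counter of finished URLs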
173 | def exists(self, url):
174 | """
175 | URL是否存在
176 | Returns:
177 | 0: 不存在
178 | 1: 存在于bloom中
179 | 2: 存在于queue中
180 | """
181 | # 在results中找
182 | sha = utils.RedisScripts.sha1('bloom_check')
183 | if self._db.evalsha(sha, 1, self.name, url):
184 | return 1
185 | # 在queue中找
186 | if self._db.sismember(self.key_filter_queue, url):
187 | return 2
188 | else:
189 | return 0
190 |
191 | def insert(self, url, priority):
192 | """忽略布隆检查,将URL插入至队列中"""
193 | self._db.lpush(self.key_waiting[priority], url)
194 | self._db.sadd(self.key_filter_queue, url)
195 | self._db.hdel(self.key_processing, url)
196 |
197 | def add(self, urls, priority):
198 | """
199 | URL批量入队
200 | 当URL相同,但priority不同时,也视为重复
201 | Returns: 经排重后添加至队列的数量
202 | """
203 | if not isinstance(priority, int):
204 | raise SchedulerError("priority应为int型")
205 | if priority < 0 or priority >= len(self.key_waiting):
206 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}")
207 |
208 | urls = list(set(urls))
209 | sha = utils.RedisScripts.sha1('url_add')
210 | return self._db.evalsha(sha, 2, self.name, priority, *urls)
211 |
212 | @classmethod
213 | def pop(cls, db: StrictRedis, names):
214 | """
215 | 从指定爬虫中弹出一条最合适的URL
216 |         Returns: (url, name, address)
217 |             当所有队列为空时,url == name == address == None
218 | """
219 | # 随机挑选爬虫
220 | names = copy.deepcopy(names)
221 | random.shuffle(names)
222 |
223 | sha = utils.RedisScripts.sha1('url_pop')
224 | url, name, address = db.evalsha(sha, 1, time.time(), *names)
225 | if url and name:
226 | return [url.decode(), name.decode(), address.decode() if address else None]
227 | else:
228 | return [None, None, None]
229 |
230 | @classmethod
231 | def purge(cls, db: StrictRedis):
232 | """
233 | 清理processing中过期的URL,返回被清理数量
234 | 被清理的URL,将被打上"timeout"标签,移入error队列
235 | """
236 | count = 0
237 | for name in Spider.names(db):
238 | key = f"{cls.prefix}:{name}:processing"
239 | queue = cls(db, name)
240 | timeout = Spider(db, name).get_field("timeout")
241 | # redis的scan是可能重复返回同一元素的
242 | for url, _start in db.hscan_iter(key):
243 | if time.time() > float(_start) + timeout: # 过期
244 | count += queue.report_error("timeout", url)
245 | return count
246 |
247 | def report_finish(self, url):
248 | """标记URL为已完成状态"""
249 | if not self._db.hdel(self.key_processing, url):
250 | return False
251 | self._db.srem(self.key_filter_queue, url)
252 | sha = utils.RedisScripts.sha1('bloom_cas')
253 | logger.debug("report_finish", self.name, url)
254 | return self._db.evalsha(sha, 1, self.name, url)
255 |
256 | def report_error(self, tag, url):
257 | """标记URL为异常状态"""
258 | if not self._db.hdel(self.key_processing, url):
259 | return False
260 | self._db.sadd(self.key_tags, tag)
261 | return self._db.lpush(f"{self.prefix_error}:{tag}", url)
262 |
263 | @property
264 | def tags(self):
265 | """获取标签列表"""
266 | return {
267 | r.decode(): self._db.llen(f"{self.prefix_error}:{r.decode()}")
268 | for r in self._db.smembers(self.key_tags)
269 | }
270 |
271 | def get_errors(self, tag, count=0):
272 | """获取指定标签下的所有异常URL"""
273 | key = f"{self.prefix_error}:{tag}"
274 | return [r.decode() for r in self._db.lrange(key, 0, count - 1)]
275 |
276 | def remove_tag(self, tag):
277 | key = f"{self.prefix_error}:{tag}"
278 | self._db.srem(self.key_tags, tag)
279 | return self._db.delete(key)
280 |
281 | def rollback_tag(self, tag, priority):
282 | """
283 | 将指定标签下的异常URL移至waiting队列中
284 | 返回回滚的URL数量
285 | """
286 | if not isinstance(priority, int):
287 | raise SchedulerError("priority应为int型")
288 | if priority < 0 or priority >= len(self.key_waiting):
289 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}")
290 | key_errors = f"{self.prefix_error}:{tag}"
291 | # 取出并删除异常URL、标签
292 | pipe = self._db.pipeline()
293 | pipe.lrange(key_errors, 0, -1) # 取出所有
294 | pipe.delete(key_errors) # 删除队列
295 | pipe.srem(self.key_tags, tag) # 删除标签
296 | res = pipe.execute()
297 | # 添加至waiting
298 | urls = res[0]
299 | if urls:
300 | self._db.lpush(self.key_waiting[priority], *urls)
301 | return len(urls)
302 |
303 | @property
304 | def details(self):
305 | """队列信息"""
306 | return {
307 | 'waiting': [self._db.llen(key) for key in self.key_waiting],
308 | 'processing': self._db.hlen(self.key_processing),
309 | 'results': int(self._db.get(self.key_filter_bloom_count) or 0),
310 | 'errors': sum([self._db.llen(f"{self.prefix_error}:{tag}") for tag in self.tags])
311 | }
312 |
--------------------------------------------------------------------------------
/pyloom/tasks.py:
--------------------------------------------------------------------------------
1 | import furl
2 | import json
3 | import redis
4 | import random
5 | import requests
6 | import traceback
7 | import simplejson.errors
8 | from .utils import *
9 | from .errors import *
10 | from lxml import etree
11 | from typing import List
12 | from typing import Union
13 | from . import scheduler, errors
14 | from bs4 import BeautifulSoup, element
15 | from .buckets import LocalBucket, ShareBucket
16 |
17 | logger = logging.getLogger("tasks")
18 |
19 |
20 | class Queue(object):
21 | """队列控制器"""
22 |
23 | def __init__(self, db, name, url):
24 | self._spider = scheduler.Spider(db, name)
25 | self._queue = scheduler.Queue(db, name)
26 | self.url = url
27 |
28 | @property
29 | def detail(self):
30 | return self._queue.details
31 |
32 | @property
33 | def timeout(self):
34 | return self._spider.get_field("timeout")
35 |
36 | @timeout.setter
37 | def timeout(self, value):
38 | if not isinstance(value, (int, float)):
39 | raise errors.TaskError("timeout应为int或float型")
40 | self._spider.set_field("timeout", value)
41 |
42 | @property
43 | def interval(self):
44 | return self._spider.get_field("interval")
45 |
46 | @interval.setter
47 | def interval(self, value):
48 | if not isinstance(value, (int, float)):
49 | raise errors.TaskError("interval应为int或float型")
50 | self._spider.set_field("interval", value)
51 |
52 | def freeze(self, seconds):
53 | """暂停调度seconds秒"""
54 | last_pop_time = time.time() + seconds - self.interval
55 | self._spider.set_field("last_pop_time", last_pop_time)
56 | logger.info("暂停调度", seconds)
57 |
58 | def stop(self):
59 | """停止调度,爬虫状态更改为'stop'"""
60 | logger.info("爬虫状态更改为'stop'")
61 | self._spider.set_field("status", -2)
62 |
63 | def finish(self):
64 | """
65 | 提前完成调度,爬虫状态更改为'finish'
66 | 默认情况下,当所有队列均为空时,爬虫状态自动变为'finish'
67 | """
68 | logger.info("爬虫状态更改为'finish'")
69 | self._spider.set_field("status", 0)
70 |
71 |
72 | class UserAgent(object):
73 | _ua = None
74 | _browsers = None
75 |
76 | def __getitem__(self, item):
77 | if UserAgent._ua is None:
78 | filename = os.path.join(os.path.dirname(__file__), "user-agent.json")
79 | with open(filename, encoding='utf8') as f:
80 | UserAgent._ua = json.load(f)
81 | UserAgent._browsers = list(UserAgent._ua.keys())
82 | if item == 'random':
83 | item = random.choice(UserAgent._browsers)
84 | return random.choice(UserAgent._ua[item])
85 |
86 | # 便于IDE提示
87 | @property
88 | def chrome(self):
89 | return self["chrome"]
90 |
91 | @property
92 | def ie(self):
93 | return self["ie"]
94 |
95 | @property
96 | def safari(self):
97 | return self["safari"]
98 |
99 | @property
100 | def firefox(self):
101 | return self["firefox"]
102 |
103 | @property
104 | def android(self):
105 | return self["android"]
106 |
107 | @property
108 | def random(self):
109 | return self["random"]
110 |
111 |
112 | class CSS(object):
113 | def __init__(self, root, pattern=":root"):
114 | if isinstance(root, (element.Tag, type(None))):
115 | self._root = root
116 | elif isinstance(root, str):
117 | self._root = BeautifulSoup(root, "lxml")
118 | else:
119 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造CSS")
120 |
121 | self._pattern = pattern
122 | self._default = ArgDefault
123 |
124 | def __bool__(self):
125 | return self._root is not None
126 |
127 | def __repr__(self):
128 | return f"CSS('{self._pattern}')"
129 |
130 | def one(self, pattern):
131 | node = self._root.select_one(pattern)
132 | return CSS(node, pattern) # type: CSS
133 |
134 | def many(self, pattern) -> List['CSS']:
135 | nodes = self._root.select(pattern)
136 | return [CSS(node, pattern) for node in nodes]
137 |
138 | def exist(self, pattern):
139 | return bool(self.one(pattern))
140 |
141 | def default(self, value):
142 | self._default = value
143 | return self
144 |
145 | def text(self, regex=None, strip=True, separator=""):
146 | if self._root is None:
147 | if self._default is ArgDefault:
148 | raise errors.TaskError(f"未找到:{repr(self)}")
149 | else:
150 | # 默认值不校验格式,直接返回
151 | return self._default
152 | _text = self._root.get_text(separator, strip)
153 | if regex is None or re.match(regex, _text):
154 | return _text
155 | else:
156 | raise errors.TaskError(f"未通过正则校验:{regex}")
157 |
158 | def html(self):
159 | if self._root is None:
160 | if self._default is ArgDefault:
161 | raise errors.TaskError(f"未找到:{repr(self)}")
162 | else:
163 | # 默认值不校验格式,直接返回
164 | return self._default
165 | return str(self._root)
166 |
167 | @property
168 | def attrs(self):
169 | return self._root.attrs
170 |
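# Usage sketch (hypothetical HTML, not part of the library):
#
# doc = CSS("<div class='title'><a href='/1'>Hello</a></div>")
# doc.one("div.title a").text()              # -> "Hello"
# doc.one("div.missing").default("").text()  # -> "" instead of raising TaskError
# [a.attrs["href"] for a in doc.many("a")]   # -> ["/1"]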
171 |
172 | class XPath(object):
173 | def __init__(self, root, pattern="/*"):
174 | if isinstance(root, (etree._Element, type(None))):
175 | self._root = root
176 | elif isinstance(root, str):
177 | self._root = etree.HTML(root)
178 | else:
179 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造XPath")
180 |
181 | self._pattern = pattern
182 | self._default = ArgDefault
183 |
184 | def __bool__(self):
185 | return self._root is not None
186 |
187 | def __repr__(self):
188 | return f"XPath('{self._pattern}')"
189 |
190 | def one(self, pattern):
191 | nodes = self._root.xpath(pattern)
192 |         if nodes:
193 |             return XPath(nodes[0], pattern)
194 |         else:
195 |             return XPath(None, pattern)
196 |
197 | def many(self, pattern):
198 | nodes = self._root.xpath(pattern)
199 | return [XPath(node, pattern) for node in nodes]
200 |
201 | def exist(self, pattern):
202 | return bool(self.one(pattern))
203 |
204 | def default(self, value):
205 | self._default = value
206 | return self
207 |
208 | def text(self, regex=None, strip=True):
209 | if self._root is None:
210 | if self._default is ArgDefault:
211 | raise errors.TaskError(f"未找到{repr(self)}")
212 | else:
213 | # 默认值不校验格式,直接返回
214 | return self._default
215 | _text = self._root.text
216 | _text = '' if _text is None else _text
217 | _text = _text.strip() if strip else _text
218 | if regex is None or re.match(regex, _text):
219 | return _text
220 | else:
221 | raise errors.TaskError(f"未通过正则校验:{regex}")
222 |
223 | @property
224 | def attrs(self):
225 | return self._root.attrib
226 |
227 |
228 | class Regex(object):
229 | def __init__(self, root):
230 | self._root = root
231 |
232 | def __bool__(self):
233 | return self._root is not None
234 |
235 | def many(self, pattern):
236 | return re.findall(pattern, self._root)
237 |
238 |
239 | class Response(object):
240 | def __init__(self, resp: requests.Response):
241 | self._resp = resp
242 | self.encoding = "utf-8"
243 | # 解析器
244 | self._css = None # type: CSS
245 | self._xpath = None # type: XPath
246 | self._json = None # type: dict
247 | self._re = None # type: Regex
248 |
249 | self.content = resp.content
250 | self.status_code = resp.status_code
251 | self.url = resp.url
252 | self.furl = furl.furl(resp.url)
253 | self.request = resp.request # type: requests.PreparedRequest
254 | self.history = resp.history # type: list
255 | self.cookies = resp.cookies # type: dict
256 | self.headers = resp.headers # type: dict
257 |
258 | @property
259 | def re(self) -> Regex:
260 | if not self._re:
261 | self._re = Regex(self.text)
262 | return self._re
263 |
264 | @property
265 | def text(self) -> str:
266 | self._resp.encoding = self.encoding
267 | return self._resp.text
268 |
269 | @property
270 | def json(self) -> dict:
271 |         if self._json is not None:
272 | return self._json
273 | try:
274 | self._json = self._resp.json()
275 | return self._json
276 | except simplejson.errors.JSONDecodeError:
277 | raise errors.JSONDecodeError
278 |
279 | @property
280 | def css(self) -> CSS:
281 | if self._css is None:
282 | self._css = CSS(self.content.decode(self.encoding))
283 | return self._css
284 |
285 | @property
286 | def xpath(self) -> XPath:
287 | if self._xpath is None:
288 | self._xpath = XPath(self.content.decode(self.encoding))
289 | return self._xpath
290 |
291 | def __repr__(self):
292 | return f"Response({self.status_code})"
293 |
294 |
295 | class Tracking(object):
296 | """数据埋点"""
297 | prefix = 'tracking'
298 |
299 | def __init__(self, name, db):
300 | self._name = name
301 | self._db = db
302 |
303 | def incr(self, field, amount=1):
304 | return self._db.incr(f"{self.prefix}:{self._name}:{field}", amount)
305 |
306 | def get(self, field):
307 | r = self._db.get(f"{self.prefix}:{self._name}:{field}")
308 | return int(r) if r else None
309 |
310 | @property
311 | def fields(self):
312 | return [i.decode().split(":", 2)[2] for i in self._db.keys(f"{self.prefix}:{self._name}:*")]
313 |
314 |
315 | class Client(object):
316 | """封装requests,便于包装响应包、掌管代理"""
317 |
318 | def __init__(self, name, db, address=None):
319 | self._address = address
320 | self._set_address(address)
321 | self.name = name
322 | self.headers = {}
323 | self._db = db # type: redis.StrictRedis
324 | self._session = requests
325 | self._reuse = False
326 |
327 | def session(self):
328 | """返回跨请求保留Cookie的客户端"""
329 | client = Client(self.name, self._db, self._address)
330 | client._session = requests.session()
331 | return client
332 |
333 | def request(self, method, url, **kwargs):
334 | try:
335 | headers = {**self.headers, **kwargs.pop("headers", {})}
336 | proxies = {**self.proxies, **kwargs.pop("proxies", {})}
337 | resp = self._session.request(
338 | method, url,
339 | headers=headers,
340 | proxies=proxies,
341 | **kwargs
342 | )
343 | except requests.exceptions.Timeout as e:
344 | raise errors.Timeout(e)
345 | except requests.exceptions.ProxyError as e:
346 | raise errors.ProxyError(e)
347 | except requests.exceptions.RequestException as e:
348 | raise errors.RequestError(e)
349 | except Exception as e:
350 | raise e
351 | return Response(resp)
352 |
353 | def get(self, url, params=None, **kwargs):
354 | return self.request("get", url, params=params, **kwargs)
355 |
356 | def post(self, url, data=None, json=None, **kwargs):
357 | return self.request("post", url, data=data, json=json, **kwargs)
358 |
359 | def head(self, url, **kwargs):
360 | return self.request("head", url, **kwargs)
361 |
362 | def options(self, url, **kwargs):
363 | return self.request("options", url, **kwargs)
364 |
365 | def patch(self, url, data=None, **kwargs):
366 | return self.request("patch", url, data=data, **kwargs)
367 |
368 | def put(self, url, data=None, **kwargs):
369 | return self.request("put", url, data=data, **kwargs)
370 |
371 | def delete(self, url, **kwargs):
372 | return self.request("delete", url, **kwargs)
373 |
374 | def _set_address(self, address):
375 | if address:
376 | proxy = address.split(":", 2)[2]
377 | self.proxies = {
378 | "http": proxy,
379 | "https": proxy
380 | }
381 | self.proxy = proxy
382 | self.address = address
383 | logger.debug("设置代理", proxy)
384 | else:
385 | self.proxies = {}
386 | self.proxy = None
387 | self.address = None
388 |
389 | def reload_proxy(self) -> bool:
390 | """丢弃当前代理并更换新代理,若代理池已无可用代理,返回False"""
391 | recycle = []
392 | try:
393 | while True:
394 | address = self._db.rpop(f"proxy:addresses:{self.name}")
395 | # 代理池空了
396 | if not address:
397 | raise TaskBreak(0)
398 | address = address.decode() # type: str
399 | _valid_at, _expire_at, _ = address.split(":", 2)
400 | valid_at, expire_at = float(_valid_at), float(_expire_at)
401 | # 未到可用时间,还回去
402 | if valid_at > time.time():
403 | recycle.append(address)
404 | continue
405 | # 已到可用时间,但过期了,直接丢弃
406 | if expire_at < time.time():
407 | continue
408 | self._set_address(address)
409 | return True
410 | finally:
411 | if recycle:
412 | self._db.lpush(f"proxy:addresses:{self.name}", *recycle)
413 |
414 | def reuse_proxy(self, freeze=0):
415 | """回收代理,并在freeze秒后可再次被分配"""
416 | # 只可reuse一次
417 | if self._reuse:
418 | return
419 | else:
420 | self._reuse = True
421 | if self.address:
422 | _, expire_at, proxy = self.address.split(":", 2)
423 | valid_at = time.time() + freeze
424 | self._db.lpush(f"proxy:addresses:{self.name}", f"{valid_at}:{expire_at}:{proxy}")
425 | logger.debug("回收代理", f"{valid_at}:{expire_at}:{proxy}")
426 | self._set_address(None)
427 |
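    # Address format shared by reload_proxy/reuse_proxy (illustrative value):
    # "<valid_at>:<expire_at>:<proxy>", e.g. "1500000000.0:1500003600.0:http://1.2.3.4:8080"
    # becomes assignable at valid_at, is discarded after expire_at, and the proxy URL
    # is recovered via address.split(":", 2)[2].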
428 | def __setattr__(self, key, value):
429 | if key in ['params', 'history']:
430 | setattr(self._session, key, value)
431 | else:
432 | super(Client, self).__setattr__(key, value)
433 |
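# Usage sketch from inside a Task (hypothetical URLs; `self.client` is created by Task.__init__):
#
# resp = self.client.get("https://example.com/item/1", timeout=10)
# title = resp.css.one("h1").text()
# session = self.client.session()  # keeps cookies across subsequent requests
# resp2 = session.post("https://example.com/login", data={"user": "demo"})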
434 |
435 | class Buckets(object):
436 | """数据存储"""
437 |
438 | def __init__(self, local, share):
439 | self.local = local # type: LocalBucket
440 | self.share = share # type: ShareBucket
441 |
442 |
443 | class Task(object):
444 | """描述爬虫行为的抽象类"""
445 | filters = []
446 |
447 | def __init__(self, name, url, db, address):
448 | """
449 | Args:
450 | name: 爬虫名
451 | url: 当前URL
452 | db: redis数据库(用户不可使用)
453 | address: 代理地址
454 | """
455 | self._spider = scheduler.Spider(db, name)
456 | self._queue = scheduler.Queue(db, name)
457 | self._db = db # type: redis.StrictRedis
458 |
459 | self.url = url # type: str
460 | self.furl = furl.furl(url)
461 | self.name = name # type: str
462 | self.logger = logging.getLogger(name)
463 | self.client = Client(name, db, address)
464 | self.queue = Queue(db, name, url)
465 | self.ua = UserAgent()
466 | self.buckets = Buckets(LocalBucket.instance(name), ShareBucket(db, name))
467 | self.args = self._spider.get_field("args")
468 | self.lock = self._db.lock # 分布式锁
469 | self.tracking = Tracking(name, db)
470 | self.result = None
471 | self.response = None # type: Response
472 |
473 | def on_download(self) -> Response:
474 | """下载并返回响应包"""
475 | raise NotImplementedError()
476 |
477 | def on_parse(self) -> dict:
478 | """提取并返回目标数据"""
479 | return {}
480 |
481 | def on_link(self) -> Union[list, dict]:
482 | """
483 | 提取并返回新链接
484 | Returns:
485 | links: links可以是list和dict两种类型
486 | dict: 指定不同的优先级: {priority: urls}
487 |                 list: 将links中的url添加到优先级为3的队列中
488 |                     相当于: {3: urls}
489 | """
490 |
491 | def on_save(self):
492 | """存储数据"""
493 | self.logger.debug("on_save", self.result)
494 |
495 | def on_finish(self):
496 | """已完成"""
497 |
498 | def on_error(self, e) -> bool:
499 | """
500 | 处理生命周期中抛出的异常(包括on_finish)
501 | Returns:
502 | True: 异常已被处理
503 | False: 异常无法处理
504 | """
505 | return False
506 |
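# Minimal sketch of a concrete Task (it would normally live in a spider package's
# tasks.py; the URL pattern and selectors below are hypothetical):
#
# class ListPage(Task):
#     filters = r"https://example\.com/list.*"  # regex(es) matched by the worker's router
#
#     def on_download(self):
#         return self.client.get(self.url, headers={"User-Agent": self.ua.chrome})
#
#     def on_parse(self):
#         return {"titles": [a.text() for a in self.response.css.many("h2 a")]}
#
#     def on_link(self):
#         return [a.attrs["href"] for a in self.response.css.many("h2 a")]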
507 |
508 | def execute(task: Task):
509 | """
510 | 运行task实例并处理所有异常
511 | Returns:
512 | links: {priority: urls}
513 | """
514 | try:
515 | task.tracking.incr('on_download')
516 | task.response = task.on_download()
517 | task.tracking.incr('on_download_ok')
518 | task.result = task.on_parse()
519 | links = task.on_link()
520 | if isinstance(links, list):
521 | links = {3: links}
522 | elif links is None:
523 | links = {}
524 | elif not isinstance(links, dict):
525 | raise errors.TaskError(f"on_link返回值应是list或dict型,而非{type(links)}")
526 | task.on_save()
527 | task.on_finish()
528 | return links
529 | except errors.TaskFinish:
530 | logger.debug("TaskFinish", task.url)
531 | task.on_finish()
532 | return {}
533 | except errors.TaskBreak as e:
534 | logger.debug("TaskBack", e.priority, task.url)
535 | task._queue.insert(task.url, e.priority)
536 | return {}
537 | except errors.TaskError as e:
538 | task._queue.report_error(e.__class__.__name__, task.url)
539 | logger.warning("Task报告的异常", str(e), task.url)
540 | return {}
541 | except Exception as e:
542 | if task.on_error(e):
543 | return {}
544 | task._queue.report_error("unknown", task.url)
545 | logger.error(f"Task未处理的异常", "unknown", task.url)
546 | traceback.print_exc()
547 | return {}
548 |
--------------------------------------------------------------------------------
/pyloom/entry.py:
--------------------------------------------------------------------------------
1 | """
2 | 程序入口
3 | 解析命令行参数、配置文件参数,启动对应模块
4 | 所有有关参数解析的操作应当在这里完成
5 | """
6 | import json
7 | import redis
8 | import daemon
9 | import signal
10 | import psutil
11 | import datetime
12 | import argparse
13 | import daemon.pidfile
14 | from .utils import *
15 | from .errors import *
16 | from tabulate import tabulate
17 | from .scheduler import Spider, Queue
18 | from . import drivers, worker, proxy, tasks
19 |
20 | logger = logging.getLogger("entry")
21 |
22 |
23 | def set_defaults(options):
24 | """设置默认值"""
25 | # 设置日志
26 | if hasattr(options, 'log'):
27 | if options.log:
28 | options.log = os.path.abspath(os.path.expanduser(options.log))
29 | else:
30 | root_path = os.path.dirname(os.path.dirname(__file__))
31 | options.log = os.path.join(root_path, 'logs')
32 | os.makedirs(options.log, exist_ok=True)
33 | logging.getLogger("requests").setLevel(logging.WARNING)
34 | patch_logger_format()
35 | if hasattr(options, 'level'):
36 | logging.basicConfig(level=options.level.upper())
37 | # 设置爬虫目录
38 | if hasattr(options, 'spider'):
39 | options.spider = os.path.abspath(os.path.expanduser(options.spider))
40 | setattr(options, 'name', os.path.basename(options.spider))
41 |
42 |
43 | def set_console_logger():
44 | """设置在控制台中输出日志"""
45 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s'
46 | date_fmt = '%y%m%d %H:%M:%S'
47 | formatter = logging.Formatter(fmt, date_fmt)
48 | handler = logging.StreamHandler()
49 | handler.setFormatter(formatter)
50 | patch_handler_color(handler)
51 | logging.root.handlers = [handler]
52 |
53 |
54 | def set_file_logger(options, filename):
55 | """
56 | 设置使用文件记录日志
57 | 需在DaemonContext中调用此函数,否则DaemonContext会关闭日志文件导致启动失败
58 | """
59 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s'
60 | date_fmt = '%y%m%d %H:%M:%S'
61 | formatter = logging.Formatter(fmt, date_fmt)
62 | handler = TimedRotatingFileHandler(
63 | filename=os.path.join(options.log, filename),
64 | backupCount=options.backup,
65 | when="MIDNIGHT"
66 | )
67 | handler.setFormatter(formatter)
68 | logging.root.handlers = [handler]
69 |
70 |
71 | def handler_common_stop(options, pid_name):
72 | """停止指定进程"""
73 | pidfile = os.path.join(options.log, pid_name)
74 | if not os.path.exists(pidfile):
75 | return "后台进程未启动"
76 | with open(pidfile) as f:
77 | pid = int(f.read())
78 | if pid:
79 | os.kill(pid, signal.SIGINT)
80 | print(f"已发出信号,等待进程退出,pid={pid}")
81 | # 等待进程退出
82 | for _ in range(32):
83 | if not psutil.pid_exists(pid):
84 | return "OK"
85 | time.sleep(1)
86 | else:
87 | return f"ERR: 进程超时未退出,pid={pid}"
88 | else:
89 | return "OK"
90 |
91 |
92 | def handler_common_tail(options, filename):
93 | """查看指定进程的日志"""
94 | logfile = os.path.join(options.log, filename)
95 | if not os.path.exists(logfile):
96 | return "没有日志"
97 | for line in tail(logfile):
98 | print(line, end='')
99 |
100 |
101 | def parse_args(args):
102 | """
103 | 从字符串中解析出多个参数
104 |
105 | >>> parse_args(" a,b,c, ")
106 | ['a', 'b', 'c']
107 | """
108 | if not args:
109 | return []
110 | args = args.replace(",", ",")
111 | return [a.strip() for a in args.split(",") if a.strip()]
112 |
113 |
114 | def handler_proxy_run(options):
115 | """启动代理池节点"""
116 | if options.damon:
117 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, 'proxy.pid'))
118 | if pidfile.is_locked():
119 | pid = pidfile.read_pid()
120 | if psutil.pid_exists(pid):
121 | return f"已有实例正在运行,pid={pid}"
122 | else:
123 | pidfile.break_lock()
124 | print("OK")
125 | with daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr):
126 | set_file_logger(options, "proxy")
127 | return proxy.start(options.redis)
128 | else:
129 | return proxy.start(options.redis)
130 |
131 |
132 | def handler_proxy_add(options):
133 | """添加代理"""
134 | db = redis.StrictRedis.from_url(options.redis)
135 | # 扫描所有驱动
136 | driver_name_to_title = {}
137 | for driver_name, var in vars(drivers).items():
138 | try:
139 | if issubclass(var, drivers.ProxyDriver) \
140 | and var is not drivers.ProxyDriver \
141 | and hasattr(var, 'title'):
142 | driver_name_to_title[driver_name] = getattr(var, 'title')
143 | except TypeError:
144 | pass
145 | if not driver_name_to_title:
146 | return "ERR: 无可用驱动"
147 | drivers_names = list(driver_name_to_title.items())
148 | # 询问用户,选择驱动
149 | print("请选择代理驱动 (填写序号或英文名称)")
150 | print('\n'.join([f"{i}. {k}, {v}" for i, (k, v) in enumerate(drivers_names)]))
151 | s = input('➜ ')
152 |     driver_name = s if s in driver_name_to_title else None
153 | if driver_name is None:
154 | try:
155 | driver_name = drivers_names[int(s)][0]
156 | except (ValueError, KeyError, IndexError):
157 | return "ERR: 序号或名称错误"
158 | print("当前驱动为: ", driver_name)
159 | driver_cls = getattr(drivers, driver_name)
160 | # 询问配置
161 | proxy_name = template_input([{
162 | "name": "name",
163 | "title": "请为当前配置设置独一无二的名称"
164 | }])['name']
165 | proxy_params = driver_cls.get_params()
166 | # 检查配置名是否重复
167 | if db.hexists("proxy:configs", proxy_name):
168 | s = input(f"配置'{proxy_name}'已存在,是否覆盖 (Y/N) ")
169 | if s.upper() != 'Y':
170 | return 'Bye~'
171 | # 写入配置
172 | proxy_params['version'] = int(time.time())
173 | proxy_params['driver'] = driver_cls.__name__
174 | db.hset("proxy:configs", proxy_name, json.dumps(proxy_params))
175 | return 'OK'
176 |
177 |
178 | def handler_proxy_remove(options):
179 | """删除代理"""
180 | db = redis.StrictRedis.from_url(options.redis)
181 |
182 | if options.name == 'all':
183 | count = db.delete("proxy:configs", *db.keys("proxy:addresses:*"))
184 | else:
185 | count = db.hdel("proxy:configs", options.name)
186 | count += db.delete(f"proxy:addresses:{options.name}")
187 | if count:
188 | return 'OK'
189 | else:
190 | return '没有代理'
191 |
192 |
193 | def handler_proxy_list(options):
194 | """列出所有代理"""
195 | db = redis.StrictRedis.from_url(options.redis)
196 |
197 | configs = db.hgetall("proxy:configs")
198 | if not configs:
199 | return "没有代理"
200 | configs = {k.decode(): json.loads(v) for k, v in configs.items()}
201 | data = [(k, v['driver']) for k, v in configs.items()]
202 | headers = ['配置名', '驱动']
203 | return tabulate(data, headers, 'presto', showindex='always')
204 |
205 |
206 | def handler_run(options):
207 | """运行爬虫"""
208 | db = redis.StrictRedis.from_url(options.redis)
209 | spider_configs = load_spider_configs(options.spider)
210 |
211 | proxies = parse_args(options.proxy)
212 | if proxies:
213 | for proxy_name in proxies:
214 | if not db.hexists("proxy:configs", proxy_name):
215 | return f"ERR: 未找到代理'{proxy_name}'"
216 | logger.info("使用代理运行", proxies)
217 |
218 | if not os.path.exists(os.path.join(options.spider, '__init__.py')):
219 | return "ERR: 未找到爬虫入口:'__init__.py'"
220 |
221 | if options.clear:
222 | logger.info("清空队列与代理数据")
223 | Spider(db, options.name).clear_proxy()
224 | Spider(db, options.name).clear_queue()
225 |
226 | if options.damon:
227 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, f'{options.name}.pid'))
228 | if pidfile.is_locked():
229 | pid = pidfile.read_pid()
230 | if psutil.pid_exists(pid):
231 | return f"已有实例正在运行,pid={pid}"
232 | else:
233 | pidfile.break_lock()
234 | logger.info("转入后台运行")
235 | with daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr):
236 | set_file_logger(options, options.name)
237 | return worker.start(
238 | options.spider, options.redis, spider_configs,
239 | proxies, options.processes, options.threads
240 | )
241 | else:
242 | return worker.start(
243 | options.spider, options.redis, spider_configs,
244 | proxies, options.processes, options.threads
245 | )
246 |
247 |
248 | def handler_remove(options):
249 | """清除数据"""
250 |     db = redis.StrictRedis.from_url(options.redis)
251 | spider = Spider(db, options.name)
252 | if options.target == 'queue':
253 | count = spider.clear_queue()
254 | return f"已清除{count}条队列数据"
255 | elif options.target == 'proxy':
256 | count = spider.clear_proxy()
257 | if count:
258 | return "已清除代理数据"
259 | else:
260 | return "没有代理数据"
261 | else:
262 | return f"无法清理:{options.target}"
263 |
264 |
265 | def handler_top(options):
266 | """查看统计"""
267 | db = redis.StrictRedis.from_url(options.redis)
268 | tracking = tasks.Tracking(options.name, db)
269 | lasts = {field: tracking.get(field) for field in sorted(tracking.fields)}
270 | try:
271 | while True:
272 | print(f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', end=' ')
273 | count = db.llen(f"proxy:addresses:{options.name}")
274 | print(f'proxy:{count}', end='; ')
275 | fields = sorted(tracking.fields)
276 | for field in fields:
277 | last = lasts.get(field, None)
278 | current = tracking.get(field)
279 | lasts[field] = current
280 | if last is None:
281 | print(f"{field}:None", end='; ')
282 | else:
283 | print(f"{field}:{round((current-last)/options.interval, 1)}", end='; ')
284 | print(end='\n')
285 | time.sleep(options.interval)
286 | except KeyboardInterrupt:
287 | return
288 |
289 |
290 | def handler_tag_list(options):
291 | """查看所有异常标签"""
292 | db = redis.StrictRedis.from_url(options.redis)
293 | if not Spider(db, options.name).exists():
294 | return "爬虫不存在"
295 | queue = Queue(db, options.name)
296 | if options.tag:
297 | data = [(d,) for d in queue.get_errors(options.tag, 0)]
298 | return tabulate(data, ['URL'], 'presto', showindex='always')
299 | else:
300 | tags = queue.tags
301 | if not tags:
302 | return "没有标签"
303 | else:
304 | data = sorted(tags.items(), key=lambda t: t[1], reverse=True)
305 | headers = ['标签', '数量']
306 | return tabulate(data, headers, 'presto', showindex='always')
307 |
308 |
309 | def handler_tag_remove(options):
310 | """移除异常标签"""
311 | db = redis.StrictRedis.from_url(options.redis)
312 | if not Spider(db, options.name).exists():
313 | return "爬虫不存在"
314 | queue = Queue(db, options.name)
315 | if options.tags == 'all':
316 | tags = queue.tags
317 | else:
318 | tags = parse_args(options.tags)
319 | if not tags:
320 | return '没有标签'
321 | for tag in tags:
322 | if queue.remove_tag(tag):
323 | print(f"已删除标签'{tag}'")
324 | else:
325 | print(f"未找到标签'{tag}'")
326 | return "OK"
327 |
328 |
329 | def handler_tag_rollback(options):
330 | """回滚异常标签下的所有任务"""
331 | db = redis.StrictRedis.from_url(options.redis)
332 | if not Spider(db, options.name).exists():
333 | return "爬虫不存在"
334 | queue = Queue(db, options.name)
335 | if options.tags == 'all':
336 | tags = queue.tags
337 | else:
338 | tags = parse_args(options.tags)
339 | if tags:
340 |         for tag in tags:
341 |             print(f"回滚'{tag}', 数量:{queue.rollback_tag(tag, 0)}, 队列优先级:0")
342 |         return "OK"
343 | else:
344 | return "未指定标签"
345 |
346 |
347 | def main():
348 | # parents
349 | log = argparse.ArgumentParser(add_help=False)
350 | log.add_argument('-l', '--level', default='info', help='日志级别')
351 | log.add_argument('--log', help='存放日志文件的目录')
352 | log.add_argument('--backup', type=int, default=3, help='日志文件保留数量')
353 | spider = argparse.ArgumentParser(add_help=False)
354 | spider.add_argument('-s', '--spider', default='./', help='指定爬虫目录')
355 | db = argparse.ArgumentParser(add_help=False)
356 | db.add_argument('-r', '--redis', default='redis://127.0.0.1:6379/0', help='指定redis地址')
357 |
358 | # pyloom
359 | node = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
360 | node.set_defaults(module=None)
361 | node_modules = node.add_subparsers()
362 |
363 | # pyloom run
364 | node_run = node_modules.add_parser('run', help='运行爬虫', parents=[log, spider, db])
365 | node_run.set_defaults(module='run')
366 | node_run.add_argument('-C', '--clear', action="store_true", help='清空爬虫数据后运行')
367 | node_run.add_argument('--proxy', help='使用指定代理运行,逗号分隔多个代理')
368 | node_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行')
369 | node_run.add_argument('-p', '--processes', default=2, type=int, help='子进程数量')
370 | node_run.add_argument('-t', '--threads', default=40, type=int, help='每个子进程的线程数量')
371 | # pyloom stop
372 | node_stop = node_modules.add_parser('stop', help='停止后台运行的爬虫', parents=[spider])
373 | node_stop.set_defaults(module='stop')
374 | # pyloom tail
375 | node_tail = node_modules.add_parser('tail', help='查看日志文件', parents=[log, spider])
376 | node_tail.set_defaults(module='tail')
377 | # pyloom top
378 | node_top = node_modules.add_parser('top', help='查看统计', parents=[spider, db])
379 | node_top.set_defaults(module='top')
380 | node_top.add_argument('-i', '--interval', default=10, type=int, help='抽样间隔')
381 | # pyloom remove
382 | node_remove = node_modules.add_parser('remove', help='清除爬虫数据')
383 | node_remove.set_defaults(module='remove')
384 | node_remove.set_defaults(target=None)
385 | node_remove_targets = node_remove.add_subparsers()
386 | # pyloom remove queue
387 | node_remove_queue = node_remove_targets.add_parser('queue', help='清除队列数据', parents=[spider, db])
388 | node_remove_queue.set_defaults(target='queue')
389 | # pyloom remove proxy
390 | node_remove_proxy = node_remove_targets.add_parser('proxy', help='清空代理池', parents=[spider, db])
391 | node_remove_proxy.set_defaults(target='proxy')
392 | # pyloom tag
393 | node_tag = node_modules.add_parser('tag', help='标签管理')
394 | node_tag.set_defaults(module='tag')
395 | node_tag.set_defaults(command=None)
396 | node_tag_commands = node_tag.add_subparsers()
397 | # pyloom tag list
398 | node_tag_list = node_tag_commands.add_parser('list', help='查看标签', parents=[spider, db])
399 | node_tag_list.set_defaults(command='list')
400 | node_tag_list.add_argument('tag', nargs='?', help='列出指定标签的内容,留空显示标签列表')
401 | # pyloom tag remove
402 | node_tag_remove = node_tag_commands.add_parser('remove', help='清除标签', parents=[spider, db])
403 | node_tag_remove.set_defaults(command='remove')
404 | node_tag_remove.add_argument('tags', help='被清除的标签,逗号分隔多个标签')
405 | # pyloom rollback :tag
406 | node_tag_rollback = node_tag_commands.add_parser('rollback', help='回滚标签', parents=[spider, db])
407 | node_tag_rollback.set_defaults(command='rollback')
408 | node_tag_rollback.add_argument('tags', help='被回滚的标签,逗号分隔多个标签')
409 |
410 | # pyloom proxy
411 | node_proxy = node_modules.add_parser('proxy', help='代理节点')
412 | node_proxy.set_defaults(module='proxy')
413 | node_proxy.set_defaults(command=None)
414 | node_proxy_commands = node_proxy.add_subparsers()
415 | # pyloom proxy run
416 | node_proxy_run = node_proxy_commands.add_parser('run', help='启动代理节点', parents=[log, db])
417 | node_proxy_run.set_defaults(command='run')
418 | node_proxy_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行')
419 | # pyloom proxy stop
420 | node_proxy_stop = node_proxy_commands.add_parser('stop', help='停止节点', parents=[log])
421 | node_proxy_stop.set_defaults(command='stop')
422 | # pyloom proxy tail
423 | node_proxy_tail = node_proxy_commands.add_parser('tail', help='查看日志', parents=[log])
424 | node_proxy_tail.set_defaults(command='tail')
425 | # pyloom proxy add
426 | node_proxy_add = node_proxy_commands.add_parser('add', help='添加代理', parents=[db])
427 | node_proxy_add.set_defaults(command='add')
428 | # pyloom proxy remove
429 | node_proxy_remove = node_proxy_commands.add_parser('remove', help='删除指定代理', parents=[db])
430 | node_proxy_remove.set_defaults(command='remove')
431 | node_proxy_remove.add_argument('name', help='欲删除的代理名称,all表示所有代理')
432 | # pyloom proxy list
433 | node_proxy_list = node_proxy_commands.add_parser('list', help='列出所有配置', parents=[db])
434 | node_proxy_list.set_defaults(command='list')
435 |
436 | # 路由至对应模块
437 | options = node.parse_args()
438 | try:
439 | set_defaults(options)
440 | set_console_logger()
441 | if options.module == 'proxy':
442 | if options.command == 'run':
443 | return handler_proxy_run(options)
444 | elif options.command == 'stop':
445 | return handler_common_stop(options, 'proxy.pid')
446 | elif options.command == 'tail':
447 | return handler_common_tail(options, 'proxy')
448 | elif options.command == 'add':
449 | return handler_proxy_add(options)
450 | elif options.command == 'remove':
451 | return handler_proxy_remove(options)
452 | elif options.command == 'list':
453 | return handler_proxy_list(options)
454 | else:
455 | return node_proxy.print_help()
456 | elif options.module == 'run':
457 | return handler_run(options)
458 | elif options.module == 'stop':
459 | return handler_common_stop(options, f'{options.name}.pid')
460 | elif options.module == 'remove':
461 | return handler_remove(options)
462 | elif options.module == 'top':
463 | return handler_top(options)
464 | elif options.module == 'tail':
465 | return handler_common_tail(options, options.name)
466 | elif options.module == 'tag':
467 | if options.command == 'list':
468 | return handler_tag_list(options)
469 | elif options.command == 'remove':
470 | return handler_tag_remove(options)
471 | elif options.command == 'rollback':
472 | return handler_tag_rollback(options)
473 | else:
474 | return node_tag.print_help()
475 | else:
476 | return node.print_help()
477 | except ConfigFileNotFoundError as e:
478 | return f'ERR: {str(e)}'
479 |
--------------------------------------------------------------------------------
/pyloom/user-agent.json:
--------------------------------------------------------------------------------
1 | {
2 | "chrome": [
3 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
4 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
5 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
6 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
7 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
8 | "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
9 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
10 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
11 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
13 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
14 | "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
15 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
16 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
17 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
18 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
19 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
20 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
21 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
23 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
25 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
26 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
27 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
29 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
30 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
32 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
34 | "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
35 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
36 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
38 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
40 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
41 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
42 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
43 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
44 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
46 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
47 | "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
48 | "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
49 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
50 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
52 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"
53 | ],
54 | "firefox": [
55 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
56 | "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
57 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
58 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
59 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
60 | "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
61 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
62 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
63 | "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
64 | "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
65 | "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
66 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
67 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
68 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
69 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
70 | "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
72 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
73 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
74 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
75 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
76 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
77 | "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
78 | "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
79 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
80 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
81 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
82 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
83 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
84 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
85 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
86 | "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
87 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
88 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
89 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
90 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
91 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
92 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
93 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
94 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
95 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
96 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
97 | "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
99 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
100 | "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
101 | "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
102 | "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
103 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
104 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6"
105 | ],
106 | "safari": [
107 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
108 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
109 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
110 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
111 | "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3",
112 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
113 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
114 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
115 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
116 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
117 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
118 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
119 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
120 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
121 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
122 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
123 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
124 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
125 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
126 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
127 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
128 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
129 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
130 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
131 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
132 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
133 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
134 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
135 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
136 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
137 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
138 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
139 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
140 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
141 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
142 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
143 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
144 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
145 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
146 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
147 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
148 | "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
149 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
150 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
151 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
152 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
153 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
154 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
155 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
156 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5"
157 | ],
158 | "ie": [
159 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
160 | "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
161 | "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
162 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
163 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
164 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
165 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
166 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
167 | "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
168 | "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
169 | "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
170 | "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
171 | "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
172 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
173 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
174 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
175 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
176 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
177 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
178 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
179 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
180 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
181 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
182 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
183 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
184 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
185 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
186 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
187 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
188 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
189 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
190 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
191 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
192 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
193 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
194 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
195 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
196 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
197 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
198 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
199 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
200 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
201 | "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
202 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
203 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
204 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
205 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
206 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
207 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
208 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)"
209 | ],
210 | "android": [
211 | "android Mozilla/5.0 (Linux; Android 8.0.0; ATU-AL10 Build/HUAWEIATU-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/hw",
212 | "android Mozilla/5.0 (Linux; Android 8.1.0; ONEPLUS A5000 Build/OPM1.171019.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/oppo"
213 | ]
214 | }
215 |
--------------------------------------------------------------------------------
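Note: user-agent.json groups User-Agent strings by browser family, presumably so the request layer can rotate them. The helper that actually consumes this file lives elsewhere in pyloom (likely utils.py); the snippet below is only a minimal sketch of how such a list could be loaded and sampled. The function name, signature, and file path are assumptions, not the project's API.

import json
import random

def random_user_agent(browser="chrome", path="pyloom/user-agent.json"):
    # Load the grouped UA lists; path is assumed relative to the repo root.
    with open(path, encoding="utf-8") as f:
        agents = json.load(f)
    # Fall back to the union of all families if the requested one is unknown.
    pool = agents.get(browser) or [ua for group in agents.values() for ua in group]
    return random.choice(pool)

if __name__ == "__main__":
    print(random_user_agent("firefox"))
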