├── .gitignore ├── README.md ├── pyloom ├── __init__.py ├── __main__.py ├── buckets.py ├── drivers.py ├── entry.py ├── errors.py ├── lua │ ├── bloom_cas.lua │ ├── bloom_check.lua │ ├── url_add.lua │ └── url_pop.lua ├── proxy.py ├── scheduler.py ├── tasks.py ├── user-agent.json ├── utils.py └── worker.py ├── setup.py └── spiders ├── DouBan250 ├── README.md ├── __init__.py └── configs.py ├── DouBanBooks ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── LaGou ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── PinDuoDuo ├── README.md ├── __init__.py ├── configs.py └── tasks.py ├── PinDuoDuoWEB ├── README.md ├── __init__.py ├── configs.py ├── get_anticontent.js └── tasks.py ├── WeiBo ├── README.md ├── __init__.py ├── configs.py └── tasks.py └── Ziroom ├── README.md ├── __init__.py ├── configs.py └── tasks.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | ### JetBrains template 93 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 94 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 95 | 96 | # User-specific stuff: 97 | .idea/ 98 | .DS_Store 99 | 100 | # Sensitive or high-churn files: 101 | .idea/dataSources.ids 102 | .idea/dataSources.xml 103 | .idea/dataSources.local.xml 104 | .idea/sqlDataSources.xml 105 | .idea/dynamic.xml 106 | .idea/uiDesigner.xml 107 | 108 | # Gradle: 109 | .idea/gradle.xml 110 | .idea/libraries 111 | 112 | # Mongo Explorer plugin: 113 | .idea/mongoSettings.xml 114 | 115 | ## File-based project format: 116 | *.iws 117 | 118 | ## Plugin-specific files: 119 | 120 | # IntelliJ 121 | /out/ 122 | 123 | # mpeltonen/sbt-idea plugin 124 | .idea_modules/ 125 | 126 | # JIRA plugin 127 | atlassian-ide-plugin.xml 128 | 129 | # Crashlytics plugin (for Android Studio and IntelliJ) 130 | com_crashlytics_export_strings.xml 131 | crashlytics.properties 132 | crashlytics-build.properties 133 | fabric.properties 134 | 135 | # pytest 136 | .pytest_cache/ 137 | 138 | 
# pyloom 139 | __dev_*.py 140 | __debugger__/ 141 | logs/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyLoom,爬龙! 2 | 3 | PyLoom想为有价值的网站编写爬虫,让开发者便捷地获取结构化的数据。 4 | 5 | PyLoom由三个部分组成, 6 | 7 | 1. 框架,减少编写、运行、维护爬虫的工作量。 8 | 9 | 2. 爬虫,寻找有价值的目标为其开发爬虫,并维护既有爬虫的可用性。 10 | 11 | 预期19年底,PyLoom将拥有围绕电子商务、房屋租售、社交网络、新闻媒体的数十个爬虫。 12 | 13 | 3. 升级爬虫,对于频繁使用的爬虫,增强其能力 14 | + 增强定制能力,例如支持限定地区、类别、关键字抓取; 15 | + 增强抓取策略,减少对代理、打码接口的使用; 16 | + 增强更新策略,更细粒度地计算重复抓取的时间。 17 | 18 | 目前进度, 19 | 20 | ①部分完成,开发常见爬虫够用了,随爬虫的开发迭代出更多功能; 21 | 22 | ②已有几款爬虫,放置于`spiders`目录。 23 | 24 | 25 | 26 | ## 安装 27 | 28 | 1. **环境要求** 29 | 30 | + python 3.6.0+ 31 | + redis 2.6+ 32 | + 类unix系统 33 | 34 | 2. **安装PyLoom** 35 | 36 | ```bash 37 | git clone https://github.com/spencer404/PyLoom.git 38 | python3.6 -m pip install -e ./PyLoom 39 | ``` 40 | 41 | > 添加 `-i https://pypi.douban.com/simple` 参数,利用豆瓣镜像提速。 42 | 43 | >出现错误`fatal error: Python.h: No such file or directory`时, 44 | > 45 | >需安装对应平台的python3.x-devel包 46 | > 47 | 48 | 49 | 50 | ## 运行 51 | 52 | 以运行`spiders/WeiBo`为例, 53 | 54 | 1. **最简参数启动爬虫** 55 | 56 | ```bash 57 | pyloom run -s PyLoom/spiders/WeiBo 58 | ``` 59 | 60 | >在爬虫目录中执行`run`时,可省略`-s`参数。 61 | 62 | 2. **启动代理池** 63 | 64 | ```bash 65 | pyloom proxy run 66 | ``` 67 | 68 | 3. **添加代理** 69 | 70 | 根据命令提示,添加名为"xxx"的代理 71 | 72 | ```bash 73 | pyloom proxy add 74 | ``` 75 | 76 | 4. **使用代理启动爬虫** 77 | 78 | ```bash 79 | pyloom run --proxy xxx 80 | ``` 81 | 82 | 命令`run`的部分常用参数: 83 | 84 | ```bash 85 | -l, --level 日志级别 86 | -s, --spider 指定爬虫目录 87 | -r, --redis 指定redis地址(URL形式) 88 | -C, --clear 清空队列、代理数据后运行 89 | --proxy 使用指定代理运行,逗号分隔多个代理 90 | --damon 作为守护进程运行 91 | -p 子进程数量 92 | -t 每个子进程的线程数量 93 | ``` 94 | 95 | 在多台服务器上运行时,若参数`-s、-r`所指向的目标相同,即可横向扩容性能。 96 | 97 | 默认地,PyLoom将抓到数据打印在日志中,你可以修改`on_save`函数自定义如何保存。 -------------------------------------------------------------------------------- /pyloom/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import Task 2 | -------------------------------------------------------------------------------- /pyloom/__main__.py: -------------------------------------------------------------------------------- 1 | """作为模块启动""" 2 | if __name__ == '__main__': 3 | from .entry import main 4 | 5 | main() 6 | -------------------------------------------------------------------------------- /pyloom/buckets.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import fnmatch 4 | import threading 5 | from .errors import BucketError 6 | from redis import StrictRedis, exceptions 7 | 8 | 9 | class LocalBucket(object): 10 | """进程内存储,重启后数据丢失""" 11 | _lock = None 12 | _instances = {} 13 | 14 | def __init__(self): 15 | self._db = {} 16 | if LocalBucket._lock is None: 17 | LocalBucket._lock = threading.Lock() 18 | 19 | @classmethod 20 | def instance(cls, name): 21 | """获取单例""" 22 | var = LocalBucket._instances.get(name, None) 23 | if var: 24 | return var 25 | var = LocalBucket() 26 | LocalBucket._instances[name] = var 27 | return var 28 | 29 | @classmethod 30 | def purge(cls): 31 | """清理由instance创建的所有实例的过期key,返回被清理的数量""" 32 | count = 0 33 | for instance in cls._instances.values(): 34 | count += instance._purge() 35 | return count 36 | 37 | def _purge(self): 38 | """清理实例中过期的key,返回被清理的数量""" 39 | keys = [] 40 | for key, (_, expire_at) in self._db.items(): 41 | if expire_at is not None and expire_at <= 
time.time(): 42 | keys.append(key) 43 | for key in keys: 44 | del self._db[key] 45 | return len(keys) 46 | 47 | def set(self, key, value, ttl=None): 48 | """为key设置value,ttl秒后失效""" 49 | item = self._db.get(key, None) 50 | if item is None or ttl is not None: 51 | # 更改value和ttl 52 | if ttl is None: 53 | expire_at = None 54 | else: 55 | expire_at = time.time() + ttl 56 | self._db[key] = [value, expire_at] 57 | else: 58 | # 只更改ttl 59 | self._db[key][0] = value 60 | 61 | def delete(self, *keys) -> int: 62 | """删除一个或多个key,返回被删除的数量""" 63 | count = 0 64 | for key in keys: 65 | item = self._db.get(key, None) 66 | # 忽略不存在的key 67 | if item is None: 68 | continue 69 | expire_at = item[1] 70 | if expire_at is None or expire_at > time.time(): 71 | del self._db[key] 72 | count += 1 73 | else: # 键已过期,不累加计数器 74 | del self._db[key] 75 | return count 76 | 77 | def get(self, key) -> object: 78 | """返回key的value,当key不存在时返回None""" 79 | item = self._db.get(key, None) 80 | if item is None: 81 | return None 82 | value, expire_at = item 83 | if expire_at is None: 84 | return value 85 | elif expire_at > time.time(): 86 | return value 87 | else: # 键已过期 88 | del self._db[key] 89 | return None 90 | 91 | def getset(self, key, value) -> object: 92 | """为给定key设置新value,返回旧value""" 93 | old_value = self.get(key) 94 | self.set(key, value) 95 | return old_value 96 | 97 | def keys(self, pattern='*') -> list: 98 | """ 99 | 返回满足pattern的所有键 100 | pattern支持通配符:?、*、[] 101 | """ 102 | expired_keys = [] 103 | valid_keys = [] 104 | n = time.time() 105 | for key, (_, expire_at) in self._db.items(): 106 | if expire_at is not None and expire_at <= n: 107 | expired_keys.append(key) 108 | else: 109 | if fnmatch.fnmatch(key, pattern): 110 | valid_keys.append(key) 111 | for key in expired_keys: 112 | del self._db[key] 113 | return valid_keys 114 | 115 | def expire(self, key, ttl) -> bool: 116 | """为给定key设置生存时间,ttl秒后被自动删除""" 117 | item = self._db.get(key, None) 118 | if item is None: 119 | return False 120 | _, expire_at = item 121 | if expire_at is None or expire_at >= time.time(): 122 | self._db[key][1] = ttl + time.time() 123 | return True 124 | else: # 键已过期 125 | del self._db[key] 126 | return False 127 | 128 | def ttl(self, key) -> int: 129 | """ 130 | 返回给定key的剩余生存时间 131 | Returns: 132 | 当key不存在时,返回-2; 133 | 当key存在但没有设置剩余生存时间时,返回-1; 134 | 否则,返回key的剩余生存时间 135 | """ 136 | item = self._db.get(key, None) 137 | if item is None: 138 | return -2 139 | value, expire_at = item 140 | if expire_at is None: 141 | return -1 142 | elif expire_at > time.time(): 143 | return expire_at - time.time() 144 | else: # 键已过期 145 | del self._db[key] 146 | return -2 147 | 148 | def incr(self, key, amount=1) -> int: 149 | """ 150 | 将给定key的值加上amount,返回incr后的值 151 | 若key不存在,key被先初始化为0,再incr 152 | 若value非int型,抛出异常 153 | """ 154 | with LocalBucket._lock: 155 | old_value = self.get(key) 156 | if old_value is None: 157 | self.set(key, 0, None) 158 | old_value = 0 159 | elif not isinstance(old_value, int): 160 | raise BucketError("incr应作用于int型的值") 161 | new_value = old_value + amount 162 | self.set(key, new_value) 163 | return new_value 164 | 165 | 166 | class ShareBucket(object): 167 | """共享存储,利用redis存储,不易失""" 168 | prefix = "bucket" 169 | 170 | def __init__(self, db: StrictRedis, name): 171 | self._db = db 172 | self.name = name 173 | self.key_prefix = f"{self.prefix}:{name}" 174 | 175 | def set(self, key, value, ttl=None): 176 | """为key设置value,ttl秒后失效""" 177 | self._db.set(f"{self.key_prefix}:{key}", json.dumps(value), ex=ttl) 178 | 179 | def delete(self, *keys) -> int: 
180 | """删除一个或多个key,返回被删除的数量""" 181 | return self._db.delete(*[f"{self.key_prefix}:{k}" for k in keys]) 182 | 183 | def get(self, key) -> object: 184 | """返回key的value,当key不存在时返回None""" 185 | res = self._db.get(f"{self.key_prefix}:{key}") 186 | if res: 187 | return json.loads(res) 188 | else: 189 | return res 190 | 191 | def getset(self, key, value) -> object: 192 | """为给定key设置新value,返回旧value""" 193 | res = self._db.getset(f"{self.key_prefix}:{key}", value) 194 | if res: 195 | return json.loads(res) 196 | else: 197 | return res 198 | 199 | def keys(self, pattern='*') -> list: 200 | """ 201 | 返回满足pattern的所有键 202 | pattern支持通配符:?、*、[] 203 | """ 204 | p = len(f"{self.key_prefix}:") 205 | res = self._db.keys(f"{self.key_prefix}:{pattern}") 206 | return [r.decode()[p:] for r in res] 207 | 208 | def expire(self, key, ttl) -> bool: 209 | """为给定key设置生存时间,ttl秒后被自动删除""" 210 | return self._db.expire(f"{self.key_prefix}:{key}", ttl) 211 | 212 | def ttl(self, key) -> int: 213 | """ 214 | 返回给定key的剩余生存时间 215 | Returns: 216 | 当key不存在时,返回-2; 217 | 当key存在但没有设置剩余生存时间时,返回-1; 218 | 否则,返回key的剩余生存时间 219 | """ 220 | return self._db.ttl(f"{self.key_prefix}:{key}") 221 | 222 | def incr(self, key, amount=1) -> int: 223 | """ 224 | 将给定key的值加上amount,返回incr后的值 225 | 若key不存在,key被先初始化为0,再incr 226 | 若value非int型,抛出异常 227 | """ 228 | try: 229 | return self._db.incr(f"{self.key_prefix}:{key}", amount) 230 | except exceptions.ResponseError as e: 231 | if e.args[0] == 'value is not an integer or out of range': 232 | raise BucketError("incr应作用于int型的值") 233 | 234 | def lpush(self, key, *values) -> int: 235 | """ 236 | 将一个或多个值value插入到列表key的表头 237 | 返回执行LPUSH命令后,列表的长度。 238 | """ 239 | return self._db.lpush(f"{self.key_prefix}:{key}", *values) 240 | 241 | def lrange(self, key, start, end) -> list: 242 | """ 243 | 返回列表 key 中指定区间内的元素,区间以偏移量 start 和 stop 指定。 244 | 包含指定区间内的元素的list 245 | """ 246 | return self._db.lrange(f"{self.key_prefix}:{key}", start, end) 247 | 248 | def lock(self, key, timeout, **kwargs): 249 | """分布式锁""" 250 | return self._db.lock(f"{self.key_prefix}:{key}", timeout, **kwargs) 251 | -------------------------------------------------------------------------------- /pyloom/drivers.py: -------------------------------------------------------------------------------- 1 | import furl 2 | import time 3 | import logging 4 | import requests 5 | import traceback 6 | from . 
import utils 7 | 8 | logger = logging.getLogger("drivers") 9 | 10 | 11 | class ProxyDriver(object): 12 | """代理驱动的基类,必须继承此类,否则驱动不能被识别""" 13 | 14 | def __init__(self, **kwargs): 15 | """在代理启动时传入自定义参数""" 16 | self.url = kwargs['url'] 17 | self.interval = kwargs['interval'] 18 | self.parallel = kwargs['parallel'] 19 | 20 | @classmethod 21 | def get_params(cls): 22 | """获取自定义参数""" 23 | template = [ 24 | { 25 | 'name': 'url', 26 | 'title': '代理提取接口?', 27 | 'example': 'http://api.example.com', 28 | 'regex': 'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 29 | }, 30 | { 31 | 'name': 'interval', 32 | 'title': '每隔多少秒调用一次接口?', 33 | 'type': int, 34 | 'note': "0-无间隔" 35 | }, 36 | { 37 | 'name': 'parallel', 38 | 'title': '每个代理能被多少线程并发使用?', 39 | 'type': int 40 | } 41 | ] 42 | return utils.template_input(template) 43 | 44 | def gen_addresses(self): 45 | """ 46 | 返回一个生成器,每次迭代时返回一个代理,其格式为: 47 | valid_at:expire_at:address 48 | valid_at : 当前时间大于valid_at时代理可用 49 | expire_at: 当前时间小于expire_at时代理可用,大于expire_at时被删除 50 | address : 代理地址,支持http、https、socks5协议 51 | """ 52 | raise NotImplementedError 53 | 54 | 55 | class MoGuProxy(ProxyDriver): 56 | title = '蘑菇API代理' 57 | 58 | def gen_addresses(self): 59 | logger.info("代理已启动", self.title, self.url) 60 | while True: 61 | try: 62 | time.sleep(self.interval / 2) # 接口故障时睡眠一半时间 63 | try: 64 | resp = requests.get(self.url, timeout=1) 65 | except Exception as e: 66 | yield False, f"接口请求异常:{e}" 67 | continue 68 | 69 | if resp.status_code != 200: 70 | yield False, f"接口状态码异常:{resp.status_code}" 71 | continue 72 | 73 | try: 74 | data = resp.json() 75 | except Exception: 76 | yield False, f"接口返回值非JSON格式" 77 | continue 78 | 79 | if int(data.get('code', -1)) != 0: 80 | yield False, f'接口返回异常:{data.get("msg", "unknown")}' 81 | continue 82 | 83 | expire_at = time.time() + 600 84 | addresses = [f"0:{expire_at}:http://{i['ip']}:{i['port']}" for i in data.get('msg', [])] 85 | yield True, addresses * self.parallel 86 | time.sleep(self.interval / 2) 87 | except Exception as e: 88 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc()) 89 | 90 | 91 | class MiPuProxy(ProxyDriver): 92 | title = "米扑开放代理" 93 | 94 | def gen_addresses(self): 95 | logger.info("代理已启动", self.title, self.url) 96 | while True: 97 | try: 98 | url = furl.furl(self.url) 99 | url.query.params.set('result_format', 'json') 100 | time.sleep(self.interval / 2) # 接口故障时睡眠一半时间 101 | try: 102 | resp = requests.get(url, timeout=1) 103 | except Exception as e: 104 | yield False, f"接口请求异常:{e}" 105 | continue 106 | 107 | if resp.status_code != 200: 108 | yield False, f"接口状态码异常:{resp.status_code}" 109 | continue 110 | 111 | try: 112 | data = resp.json() 113 | except Exception: 114 | yield False, f"接口返回值非JSON格式" 115 | continue 116 | 117 | if int(data.get('code', -1)) != 0: 118 | yield False, f'接口返回异常:{data.get("msg", "unknown")}' 119 | continue 120 | 121 | expire_at = time.time() + 60 * 60 * 24 * 30 * 12 # 有效期一年 122 | addresses = [] 123 | for item in data.get('result', []): 124 | scheme = item['http_type'].lower() 125 | server = item['ip:port'] 126 | addresses.append(f"0:{expire_at}:{scheme}://{server}") 127 | 128 | yield True, addresses * self.parallel 129 | time.sleep(self.interval / 2) 130 | except Exception as e: 131 | logger.error("未处理的异常", type(e), e, '\n', traceback.format_exc()) 132 | -------------------------------------------------------------------------------- /pyloom/entry.py: -------------------------------------------------------------------------------- 1 | """ 2 | 程序入口 3 | 解析命令行参数、配置文件参数,启动对应模块 
4 | 所有有关参数解析的操作应当在这里完成 5 | """ 6 | import json 7 | import redis 8 | import daemon 9 | import signal 10 | import psutil 11 | import datetime 12 | import argparse 13 | import daemon.pidfile 14 | from .utils import * 15 | from .errors import * 16 | from tabulate import tabulate 17 | from .scheduler import Spider, Queue 18 | from . import drivers, worker, proxy, tasks 19 | 20 | logger = logging.getLogger("entry") 21 | 22 | 23 | def set_defaults(options): 24 | """设置默认值""" 25 | # 设置日志 26 | if hasattr(options, 'log'): 27 | if options.log: 28 | options.log = os.path.abspath(os.path.expanduser(options.log)) 29 | else: 30 | root_path = os.path.dirname(os.path.dirname(__file__)) 31 | options.log = os.path.join(root_path, 'logs') 32 | os.makedirs(options.log, exist_ok=True) 33 | logging.getLogger("requests").setLevel(logging.WARNING) 34 | patch_logger_format() 35 | if hasattr(options, 'level'): 36 | logging.basicConfig(level=options.level.upper()) 37 | # 设置爬虫目录 38 | if hasattr(options, 'spider'): 39 | options.spider = os.path.abspath(os.path.expanduser(options.spider)) 40 | setattr(options, 'name', os.path.basename(options.spider)) 41 | 42 | 43 | def set_console_logger(): 44 | """设置在控制台中输出日志""" 45 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s' 46 | date_fmt = '%y%m%d %H:%M:%S' 47 | formatter = logging.Formatter(fmt, date_fmt) 48 | handler = logging.StreamHandler() 49 | handler.setFormatter(formatter) 50 | patch_handler_color(handler) 51 | logging.root.handlers = [handler] 52 | 53 | 54 | def set_file_logger(options, filename): 55 | """ 56 | 设置使用文件记录日志 57 | 需在DaemonContext中调用此函数,否则DaemonContext会关闭日志文件导致启动失败 58 | """ 59 | fmt = fr'[%(levelname)1.1s][%(asctime)s.%(msecs)03d][%(name)s] %(message)s' 60 | date_fmt = '%y%m%d %H:%M:%S' 61 | formatter = logging.Formatter(fmt, date_fmt) 62 | handler = TimedRotatingFileHandler( 63 | filename=os.path.join(options.log, filename), 64 | backupCount=options.backup, 65 | when="MIDNIGHT" 66 | ) 67 | handler.setFormatter(formatter) 68 | logging.root.handlers = [handler] 69 | 70 | 71 | def handler_common_stop(options, pid_name): 72 | """停止指定进程""" 73 | pidfile = os.path.join(options.log, pid_name) 74 | if not os.path.exists(pidfile): 75 | return "后台进程未启动" 76 | with open(pidfile) as f: 77 | pid = int(f.read()) 78 | if pid: 79 | os.kill(pid, signal.SIGINT) 80 | print(f"已发出信号,等待进程退出,pid={pid}") 81 | # 等待进程退出 82 | for _ in range(32): 83 | if not psutil.pid_exists(pid): 84 | return "OK" 85 | time.sleep(1) 86 | else: 87 | return f"ERR: 进程超时未退出,pid={pid}" 88 | else: 89 | return "OK" 90 | 91 | 92 | def handler_common_tail(options, filename): 93 | """查看指定进程的日志""" 94 | logfile = os.path.join(options.log, filename) 95 | if not os.path.exists(logfile): 96 | return "没有日志" 97 | for line in tail(logfile): 98 | print(line, end='') 99 | 100 | 101 | def parse_args(args): 102 | """ 103 | 从字符串中解析出多个参数 104 | 105 | >>> parse_args(" a,b,c, ") 106 | ['a', 'b', 'c'] 107 | """ 108 | if not args: 109 | return [] 110 | args = args.replace(",", ",") 111 | return [a.strip() for a in args.split(",") if a.strip()] 112 | 113 | 114 | def handler_proxy_run(options): 115 | """启动代理池节点""" 116 | if options.damon: 117 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, 'proxy.pid')) 118 | if pidfile.is_locked(): 119 | pid = pidfile.read_pid() 120 | if psutil.pid_exists(pid): 121 | return f"已有实例正在运行,pid={pid}" 122 | else: 123 | pidfile.break_lock() 124 | print("OK") 125 | with daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr): 126 | set_file_logger(options, 
"proxy") 127 | return proxy.start(options.redis) 128 | else: 129 | return proxy.start(options.redis) 130 | 131 | 132 | def handler_proxy_add(options): 133 | """添加代理""" 134 | db = redis.StrictRedis.from_url(options.redis) 135 | # 扫描所有驱动 136 | driver_name_to_title = {} 137 | for driver_name, var in vars(drivers).items(): 138 | try: 139 | if issubclass(var, drivers.ProxyDriver) \ 140 | and var is not drivers.ProxyDriver \ 141 | and hasattr(var, 'title'): 142 | driver_name_to_title[driver_name] = getattr(var, 'title') 143 | except TypeError: 144 | pass 145 | if not driver_name_to_title: 146 | return "ERR: 无可用驱动" 147 | drivers_names = list(driver_name_to_title.items()) 148 | # 询问用户,选择驱动 149 | print("请选择代理驱动 (填写序号或英文名称)") 150 | print('\n'.join([f"{i}. {k}, {v}" for i, (k, v) in enumerate(drivers_names)])) 151 | s = input('➜ ') 152 | driver_name = driver_name_to_title.get(s) and s 153 | if driver_name is None: 154 | try: 155 | driver_name = drivers_names[int(s)][0] 156 | except (ValueError, KeyError, IndexError): 157 | return "ERR: 序号或名称错误" 158 | print("当前驱动为: ", driver_name) 159 | driver_cls = getattr(drivers, driver_name) 160 | # 询问配置 161 | proxy_name = template_input([{ 162 | "name": "name", 163 | "title": "请为当前配置设置独一无二的名称" 164 | }])['name'] 165 | proxy_params = driver_cls.get_params() 166 | # 检查配置名是否重复 167 | if db.hexists("proxy:configs", proxy_name): 168 | s = input(f"配置'{proxy_name}'已存在,是否覆盖 (Y/N) ") 169 | if s.upper() != 'Y': 170 | return 'Bye~' 171 | # 写入配置 172 | proxy_params['version'] = int(time.time()) 173 | proxy_params['driver'] = driver_cls.__name__ 174 | db.hset("proxy:configs", proxy_name, json.dumps(proxy_params)) 175 | return 'OK' 176 | 177 | 178 | def handler_proxy_remove(options): 179 | """删除代理""" 180 | db = redis.StrictRedis.from_url(options.redis) 181 | 182 | if options.name == 'all': 183 | count = db.delete("proxy:configs", *db.keys("proxy:addresses:*")) 184 | else: 185 | count = db.hdel("proxy:configs", options.name) 186 | count += db.delete(f"proxy:addresses:{options.name}") 187 | if count: 188 | return 'OK' 189 | else: 190 | return '没有代理' 191 | 192 | 193 | def handler_proxy_list(options): 194 | """列出所有代理""" 195 | db = redis.StrictRedis.from_url(options.redis) 196 | 197 | configs = db.hgetall("proxy:configs") 198 | if not configs: 199 | return "没有代理" 200 | configs = {k.decode(): json.loads(v) for k, v in configs.items()} 201 | data = [(k, v['driver']) for k, v in configs.items()] 202 | headers = ['配置名', '驱动'] 203 | return tabulate(data, headers, 'presto', showindex='always') 204 | 205 | 206 | def handler_run(options): 207 | """运行爬虫""" 208 | db = redis.StrictRedis.from_url(options.redis) 209 | spider_configs = load_spider_configs(options.spider) 210 | 211 | proxies = parse_args(options.proxy) 212 | if proxies: 213 | for proxy_name in proxies: 214 | if not db.hexists("proxy:configs", proxy_name): 215 | return f"ERR: 未找到代理'{proxy_name}'" 216 | logger.info("使用代理运行", proxies) 217 | 218 | if not os.path.exists(os.path.join(options.spider, '__init__.py')): 219 | return "ERR: 未找到爬虫入口:'__init__.py'" 220 | 221 | if options.clear: 222 | logger.info("清空队列与代理数据") 223 | Spider(db, options.name).clear_proxy() 224 | Spider(db, options.name).clear_queue() 225 | 226 | if options.damon: 227 | pidfile = daemon.pidfile.PIDLockFile(os.path.join(options.log, f'{options.name}.pid')) 228 | if pidfile.is_locked(): 229 | pid = pidfile.read_pid() 230 | if psutil.pid_exists(pid): 231 | return f"已有实例正在运行,pid={pid}" 232 | else: 233 | pidfile.break_lock() 234 | logger.info("转入后台运行") 235 | with 
daemon.DaemonContext(pidfile=pidfile, stderr=sys.stderr): 236 | set_file_logger(options, options.name) 237 | return worker.start( 238 | options.spider, options.redis, spider_configs, 239 | proxies, options.processes, options.threads 240 | ) 241 | else: 242 | return worker.start( 243 | options.spider, options.redis, spider_configs, 244 | proxies, options.processes, options.threads 245 | ) 246 | 247 | 248 | def handler_remove(options): 249 | """清除数据""" 250 | db = redis.StrictRedis(options.redis) 251 | spider = Spider(db, options.name) 252 | if options.target == 'queue': 253 | count = spider.clear_queue() 254 | return f"已清除{count}条队列数据" 255 | elif options.target == 'proxy': 256 | count = spider.clear_proxy() 257 | if count: 258 | return "已清除代理数据" 259 | else: 260 | return "没有代理数据" 261 | else: 262 | return f"无法清理:{options.target}" 263 | 264 | 265 | def handler_top(options): 266 | """查看统计""" 267 | db = redis.StrictRedis.from_url(options.redis) 268 | tracking = tasks.Tracking(options.name, db) 269 | lasts = {field: tracking.get(field) for field in sorted(tracking.fields)} 270 | try: 271 | while True: 272 | print(f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', end=' ') 273 | count = db.llen(f"proxy:addresses:{options.name}") 274 | print(f'proxy:{count}', end='; ') 275 | fields = sorted(tracking.fields) 276 | for field in fields: 277 | last = lasts.get(field, None) 278 | current = tracking.get(field) 279 | lasts[field] = current 280 | if last is None: 281 | print(f"{field}:None", end='; ') 282 | else: 283 | print(f"{field}:{round((current-last)/options.interval, 1)}", end='; ') 284 | print(end='\n') 285 | time.sleep(options.interval) 286 | except KeyboardInterrupt: 287 | return 288 | 289 | 290 | def handler_tag_list(options): 291 | """查看所有异常标签""" 292 | db = redis.StrictRedis.from_url(options.redis) 293 | if not Spider(db, options.name).exists(): 294 | return "爬虫不存在" 295 | queue = Queue(db, options.name) 296 | if options.tag: 297 | data = [(d,) for d in queue.get_errors(options.tag, 0)] 298 | return tabulate(data, ['URL'], 'presto', showindex='always') 299 | else: 300 | tags = queue.tags 301 | if not tags: 302 | return "没有标签" 303 | else: 304 | data = sorted(tags.items(), key=lambda t: t[1], reverse=True) 305 | headers = ['标签', '数量'] 306 | return tabulate(data, headers, 'presto', showindex='always') 307 | 308 | 309 | def handler_tag_remove(options): 310 | """移除异常标签""" 311 | db = redis.StrictRedis.from_url(options.redis) 312 | if not Spider(db, options.name).exists(): 313 | return "爬虫不存在" 314 | queue = Queue(db, options.name) 315 | if options.tags == 'all': 316 | tags = queue.tags 317 | else: 318 | tags = parse_args(options.tags) 319 | if not tags: 320 | return '没有标签' 321 | for tag in tags: 322 | if queue.remove_tag(tag): 323 | print(f"已删除标签'{tag}'") 324 | else: 325 | print(f"未找到标签'{tag}'") 326 | return "OK" 327 | 328 | 329 | def handler_tag_rollback(options): 330 | """回滚异常标签下的所有任务""" 331 | db = redis.StrictRedis.from_url(options.redis) 332 | if not Spider(db, options.name).exists(): 333 | return "爬虫不存在" 334 | queue = Queue(db, options.name) 335 | if options.tags == 'all': 336 | tags = queue.tags 337 | else: 338 | tags = parse_args(options.tags) 339 | if tags: 340 | for tag in tags: 341 | count = queue.rollback_tag(tag, 0) 342 | return f"回滚'{tag}', 数量:{count}, 队列优先级:0" 343 | else: 344 | return "未指定标签" 345 | 346 | 347 | def main(): 348 | # parents 349 | log = argparse.ArgumentParser(add_help=False) 350 | log.add_argument('-l', '--level', default='info', help='日志级别') 351 | 
log.add_argument('--log', help='存放日志文件的目录') 352 | log.add_argument('--backup', type=int, default=3, help='日志文件保留数量') 353 | spider = argparse.ArgumentParser(add_help=False) 354 | spider.add_argument('-s', '--spider', default='./', help='指定爬虫目录') 355 | db = argparse.ArgumentParser(add_help=False) 356 | db.add_argument('-r', '--redis', default='redis://127.0.0.1:6379/0', help='指定redis地址') 357 | 358 | # pyloom 359 | node = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 360 | node.set_defaults(module=None) 361 | node_modules = node.add_subparsers() 362 | 363 | # pyloom run 364 | node_run = node_modules.add_parser('run', help='运行爬虫', parents=[log, spider, db]) 365 | node_run.set_defaults(module='run') 366 | node_run.add_argument('-C', '--clear', action="store_true", help='清空爬虫数据后运行') 367 | node_run.add_argument('--proxy', help='使用指定代理运行,逗号分隔多个代理') 368 | node_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行') 369 | node_run.add_argument('-p', '--processes', default=2, type=int, help='子进程数量') 370 | node_run.add_argument('-t', '--threads', default=40, type=int, help='每个子进程的线程数量') 371 | # pyloom stop 372 | node_stop = node_modules.add_parser('stop', help='停止后台运行的爬虫', parents=[spider]) 373 | node_stop.set_defaults(module='stop') 374 | # pyloom tail 375 | node_tail = node_modules.add_parser('tail', help='查看日志文件', parents=[log, spider]) 376 | node_tail.set_defaults(module='tail') 377 | # pyloom top 378 | node_top = node_modules.add_parser('top', help='查看统计', parents=[spider, db]) 379 | node_top.set_defaults(module='top') 380 | node_top.add_argument('-i', '--interval', default=10, type=int, help='抽样间隔') 381 | # pyloom remove 382 | node_remove = node_modules.add_parser('remove', help='清除爬虫数据') 383 | node_remove.set_defaults(module='remove') 384 | node_remove.set_defaults(target=None) 385 | node_remove_targets = node_remove.add_subparsers() 386 | # pyloom remove queue 387 | node_remove_queue = node_remove_targets.add_parser('queue', help='清除队列数据', parents=[spider, db]) 388 | node_remove_queue.set_defaults(target='queue') 389 | # pyloom remove proxy 390 | node_remove_proxy = node_remove_targets.add_parser('proxy', help='清空代理池', parents=[spider, db]) 391 | node_remove_proxy.set_defaults(target='proxy') 392 | # pyloom tag 393 | node_tag = node_modules.add_parser('tag', help='标签管理') 394 | node_tag.set_defaults(module='tag') 395 | node_tag.set_defaults(command=None) 396 | node_tag_commands = node_tag.add_subparsers() 397 | # pyloom tag list 398 | node_tag_list = node_tag_commands.add_parser('list', help='查看标签', parents=[spider, db]) 399 | node_tag_list.set_defaults(command='list') 400 | node_tag_list.add_argument('tag', nargs='?', help='列出指定标签的内容,留空显示标签列表') 401 | # pyloom tag remove 402 | node_tag_remove = node_tag_commands.add_parser('remove', help='清除标签', parents=[spider, db]) 403 | node_tag_remove.set_defaults(command='remove') 404 | node_tag_remove.add_argument('tags', help='被清除的标签,逗号分隔多个标签') 405 | # pyloom rollback :tag 406 | node_tag_rollback = node_tag_commands.add_parser('rollback', help='回滚标签', parents=[spider, db]) 407 | node_tag_rollback.set_defaults(command='rollback') 408 | node_tag_rollback.add_argument('tags', help='被回滚的标签,逗号分隔多个标签') 409 | 410 | # pyloom proxy 411 | node_proxy = node_modules.add_parser('proxy', help='代理节点') 412 | node_proxy.set_defaults(module='proxy') 413 | node_proxy.set_defaults(command=None) 414 | node_proxy_commands = node_proxy.add_subparsers() 415 | # pyloom proxy run 416 | node_proxy_run = 
node_proxy_commands.add_parser('run', help='启动代理节点', parents=[log, db]) 417 | node_proxy_run.set_defaults(command='run') 418 | node_proxy_run.add_argument('-d', '--damon', action="store_true", help='作为守护进程运行') 419 | # pyloom proxy stop 420 | node_proxy_stop = node_proxy_commands.add_parser('stop', help='停止节点', parents=[log]) 421 | node_proxy_stop.set_defaults(command='stop') 422 | # pyloom proxy tail 423 | node_proxy_tail = node_proxy_commands.add_parser('tail', help='查看日志', parents=[log]) 424 | node_proxy_tail.set_defaults(command='tail') 425 | # pyloom proxy add 426 | node_proxy_add = node_proxy_commands.add_parser('add', help='添加代理', parents=[db]) 427 | node_proxy_add.set_defaults(command='add') 428 | # pyloom proxy remove 429 | node_proxy_remove = node_proxy_commands.add_parser('remove', help='删除指定代理', parents=[db]) 430 | node_proxy_remove.set_defaults(command='remove') 431 | node_proxy_remove.add_argument('name', help='欲删除的代理名称,all表示所有代理') 432 | # pyloom proxy list 433 | node_proxy_list = node_proxy_commands.add_parser('list', help='列出所有配置', parents=[db]) 434 | node_proxy_list.set_defaults(command='list') 435 | 436 | # 路由至对应模块 437 | options = node.parse_args() 438 | try: 439 | set_defaults(options) 440 | set_console_logger() 441 | if options.module == 'proxy': 442 | if options.command == 'run': 443 | return handler_proxy_run(options) 444 | elif options.command == 'stop': 445 | return handler_common_stop(options, 'proxy.pid') 446 | elif options.command == 'tail': 447 | return handler_common_tail(options, 'proxy') 448 | elif options.command == 'add': 449 | return handler_proxy_add(options) 450 | elif options.command == 'remove': 451 | return handler_proxy_remove(options) 452 | elif options.command == 'list': 453 | return handler_proxy_list(options) 454 | else: 455 | return node_proxy.print_help() 456 | elif options.module == 'run': 457 | return handler_run(options) 458 | elif options.module == 'stop': 459 | return handler_common_stop(options, f'{options.name}.pid') 460 | elif options.module == 'remove': 461 | return handler_remove(options) 462 | elif options.module == 'top': 463 | return handler_top(options) 464 | elif options.module == 'tail': 465 | return handler_common_tail(options, options.name) 466 | elif options.module == 'tag': 467 | if options.command == 'list': 468 | return handler_tag_list(options) 469 | elif options.command == 'remove': 470 | return handler_tag_remove(options) 471 | elif options.command == 'rollback': 472 | return handler_tag_rollback(options) 473 | else: 474 | return node_tag.print_help() 475 | else: 476 | return node.print_help() 477 | except ConfigFileNotFoundError as e: 478 | return f'ERR: {str(e)}' 479 | -------------------------------------------------------------------------------- /pyloom/errors.py: -------------------------------------------------------------------------------- 1 | class TaskError(Exception): 2 | """ 3 | Task生命周期中出现的异常 4 | 若某Task抛出此异常,当前URL将被加入异常队列(error) 5 | """ 6 | 7 | def __init__(self, tag): 8 | self.err = f"TaskError('{tag}')" 9 | 10 | def __str__(self): 11 | return self.err 12 | 13 | 14 | class TaskFinish(Exception): 15 | """提前结束生命周期,并将当前URL加入布隆过滤器""" 16 | 17 | 18 | class TaskBreak(Exception): 19 | """提前结束生命周期,并将当前URL归还到任务队列""" 20 | 21 | def __init__(self, priority=0): 22 | """ 23 | Args: 24 | priority: 指定队列优先级 25 | """ 26 | self.priority = priority 27 | 28 | 29 | class RetryExceeded(TaskError): 30 | """重试次数超限""" 31 | 32 | def __init__(self): 33 | self.err = "RetryExceeded" 34 | 35 | 36 | class RequestError(Exception): 37 | 
"""请求异常""" 38 | 39 | 40 | class Timeout(RequestError): 41 | """请求超时""" 42 | 43 | 44 | class ProxyError(Exception): 45 | """代理异常""" 46 | 47 | 48 | class RetryError(Exception): 49 | """重试错误,需搭配tasks.retry装饰器使用""" 50 | 51 | 52 | class JSONDecodeError(ValueError): 53 | """使用Response().json时出现解码错误""" 54 | 55 | 56 | class DebuggerError(Exception): 57 | pass 58 | 59 | 60 | class SchedulerError(Exception): 61 | pass 62 | 63 | 64 | class ConfigError(Exception): 65 | def __init__(self, name, err=None): 66 | self.name = name 67 | self.err = err 68 | 69 | def __str__(self): 70 | s = f"配置'{self.name}'有误" 71 | if self.err is not None: 72 | s += f", {self.err}" 73 | return s 74 | 75 | 76 | class ConfigFileNotFoundError(ConfigError, FileNotFoundError): 77 | def __init__(self, file): 78 | self.file = file 79 | 80 | def __str__(self): 81 | return f"未找到配置文件:'{self.file}'" 82 | 83 | 84 | class ConfigNotNone(ConfigError, ValueError): 85 | def __init__(self, name): 86 | self.name = name 87 | 88 | def __str__(self): 89 | return f"缺少配置项:'{self.name}'" 90 | 91 | 92 | class BucketError(Exception): 93 | pass 94 | -------------------------------------------------------------------------------- /pyloom/lua/bloom_cas.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2017 Erik Dubbelboer 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ]] 24 | 25 | -- 在原作者Erik Dubbelboer的成果上做了简单修改 26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter 27 | 28 | local bloom_cas = function(name, entries, precision, value) 29 | local hash = redis.sha1hex(value) 30 | local prefix = "queue:" .. name .. ":filter:bloom" 31 | local countkey = prefix .. 
':count' 32 | local count = redis.call('GET', countkey) 33 | if not count then 34 | count = 1 35 | else 36 | count = count + 1 37 | end 38 | 39 | local factor = math.ceil((entries + count) / entries) 40 | -- 0.69314718055995 = ln(2) 41 | local index = math.ceil(math.log(factor) / 0.69314718055995) 42 | local scale = math.pow(2, index - 1) * entries 43 | 44 | -- This uses a variation on: 45 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 46 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 47 | local h = {} 48 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 49 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 50 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 51 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 52 | 53 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 54 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 55 | -- 0.4804530139182 = ln(2)^2 56 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 57 | 58 | -- 0.69314718055995 = ln(2) 59 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 60 | local b = {} 61 | 62 | for i = 1, maxk do 63 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 64 | end 65 | 66 | -- Only do this if we have data already. 67 | if index > 1 then 68 | -- The last fiter will be handled below. 69 | for n = 1, index - 1 do 70 | local key = prefix .. ':' .. n 71 | local scale = math.pow(2, n - 1) * entries 72 | 73 | -- 0.4804530139182 = ln(2)^2 74 | local bits = math.floor((scale * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 75 | 76 | -- 0.69314718055995 = ln(2) 77 | local k = math.floor(0.69314718055995 * bits / scale) 78 | 79 | local found = true 80 | for i = 1, k do 81 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 82 | found = false 83 | break 84 | end 85 | end 86 | 87 | if found then 88 | return 1 89 | end 90 | end 91 | end 92 | 93 | -- For the last filter we do a SETBIT where we check the result value. 94 | local key = prefix .. ':' .. index 95 | 96 | local found = 1 97 | for i = 1, maxk do 98 | if redis.call('SETBIT', key, b[i] % maxbits, 1) == 0 then 99 | found = 0 100 | end 101 | end 102 | 103 | if found == 0 then 104 | -- INCR is a little bit faster than SET. 105 | redis.call('INCR', countkey) 106 | end 107 | 108 | return found 109 | end 110 | 111 | -- 从爬虫配置读取布隆参数 112 | local name = KEYS[1] 113 | local key_spider = "spider:" .. 
name 114 | local precision = redis.call('HGET', key_spider, 'precision') 115 | if not precision then 116 | return { err = "爬虫未配置'precision'" } 117 | end 118 | 119 | return bloom_cas(name, 1000000, precision, ARGV[1]) 120 | -------------------------------------------------------------------------------- /pyloom/lua/bloom_check.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2017 Erik Dubbelboer 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ]] 24 | 25 | -- 在原作者Erik Dubbelboer的成果上做了简单修改 26 | -- https://github.com/erikdubbelboer/redis-lua-scaling-bloom-filter 27 | 28 | local bloom_check = function(name, entries, precision, value) 29 | local prefix = "queue:" .. name .. ":filter:bloom" 30 | local count = redis.call('GET', prefix .. ':count') 31 | if not count then 32 | return 0 33 | end 34 | 35 | local factor = math.ceil((entries + count) / entries) 36 | -- 0.69314718055995 = ln(2) 37 | local index = math.ceil(math.log(factor) / 0.69314718055995) 38 | local scale = math.pow(2, index - 1) * entries 39 | local hash = redis.sha1hex(value) 40 | 41 | -- This uses a variation on: 42 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 43 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 44 | local h = {} 45 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 46 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 47 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 48 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 49 | 50 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 51 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 52 | -- 0.4804530139182 = ln(2)^2 53 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 54 | -- 0.69314718055995 = ln(2) 55 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 56 | local b = {} 57 | 58 | for i = 1, maxk do 59 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 60 | end 61 | 62 | for n = 1, index do 63 | local key = prefix .. ':' .. 
n 64 | local found = true 65 | local scalen = math.pow(2, n - 1) * entries 66 | 67 | -- 0.4804530139182 = ln(2)^2 68 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 69 | 70 | -- 0.69314718055995 = ln(2) 71 | local k = math.floor(0.69314718055995 * bits / scalen) 72 | 73 | for i = 1, k do 74 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 75 | found = false 76 | break 77 | end 78 | end 79 | 80 | if found then 81 | return 1 82 | end 83 | end 84 | 85 | return 0 86 | end 87 | 88 | -- 从爬虫配置中读取布隆参数 89 | local name = KEYS[1] 90 | local key_spider = "spider:" .. name 91 | local precision = redis.call('HGET', key_spider, 'precision') 92 | if not precision then 93 | return { err = "爬虫未配置'precision'" } 94 | end 95 | 96 | return bloom_check(name, 1000000, precision, ARGV[1]) 97 | -------------------------------------------------------------------------------- /pyloom/lua/url_add.lua: -------------------------------------------------------------------------------- 1 | -- 将一组URLs排重后添加至队列 2 | -- Keys: name priority 3 | -- Argv: url [url ...] 4 | -- Return: count,成功数量(不重复数量) 5 | 6 | local bloom_check = function(name, entries, precision, value) 7 | local prefix = "queue:" .. name .. ":filter:bloom" 8 | local count = redis.call('GET', prefix .. ':count') 9 | if not count then 10 | return 0 11 | end 12 | 13 | local factor = math.ceil((entries + count) / entries) 14 | -- 0.69314718055995 = ln(2) 15 | local index = math.ceil(math.log(factor) / 0.69314718055995) 16 | local scale = math.pow(2, index - 1) * entries 17 | local hash = redis.sha1hex(value) 18 | 19 | -- This uses a variation on: 20 | -- 'Less Hashing, Same Performance: Building a Better Bloom Filter' 21 | -- https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 22 | local h = {} 23 | h[0] = tonumber(string.sub(hash, 1, 8), 16) 24 | h[1] = tonumber(string.sub(hash, 9, 16), 16) 25 | h[2] = tonumber(string.sub(hash, 17, 24), 16) 26 | h[3] = tonumber(string.sub(hash, 25, 32), 16) 27 | 28 | -- Based on the math from: http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 29 | -- Combined with: http://www.sciencedirect.com/science/article/pii/S0020019006003127 30 | -- 0.4804530139182 = ln(2)^2 31 | local maxbits = math.floor((scale * math.log(precision * math.pow(0.5, index))) / -0.4804530139182) 32 | -- 0.69314718055995 = ln(2) 33 | local maxk = math.floor(0.69314718055995 * maxbits / scale) 34 | local b = {} 35 | 36 | for i = 1, maxk do 37 | table.insert(b, h[i % 2] + i * h[2 + (((i + (i % 2)) % 4) / 2)]) 38 | end 39 | 40 | for n = 1, index do 41 | local key = prefix .. ':' .. n 42 | local found = true 43 | local scalen = math.pow(2, n - 1) * entries 44 | 45 | -- 0.4804530139182 = ln(2)^2 46 | local bits = math.floor((scalen * math.log(precision * math.pow(0.5, n))) / -0.4804530139182) 47 | 48 | -- 0.69314718055995 = ln(2) 49 | local k = math.floor(0.69314718055995 * bits / scalen) 50 | 51 | for i = 1, k do 52 | if redis.call('GETBIT', key, b[i] % bits) == 0 then 53 | found = false 54 | break 55 | end 56 | end 57 | 58 | if found then 59 | return 1 60 | end 61 | end 62 | 63 | return 0 64 | end 65 | 66 | local name = KEYS[1] 67 | local priority = KEYS[2] 68 | local key_spider = "spider:" .. name 69 | local key_waiting = "queue:" .. name .. ":waiting:" .. priority 70 | local key_filter_queue = "queue:" .. name .. 
":filter:queue" 71 | local filter = {} -- 对ARGV排重 72 | local urls = {} -- 将被批量添加到waiting队列的URL 73 | 74 | -- 从爬虫配置中读取布隆参数 75 | local precision = redis.call('HGET', key_spider, 'precision') 76 | if not precision then 77 | return { err = "爬虫未配置'precision'" } 78 | end 79 | 80 | -- 排重 81 | for i = 1, #ARGV do 82 | local url = ARGV[i] 83 | local exists = filter[url] or 84 | bloom_check(name, 1000000, precision, url) == 1 or 85 | redis.call('SISMEMBER', key_filter_queue, url) == 1 86 | if not exists then 87 | filter[url] = true 88 | table.insert(urls, url) 89 | end 90 | end 91 | 92 | -- 加入队列 93 | if #urls == 0 then 94 | return 0 95 | else 96 | redis.call('LPUSH', key_waiting, unpack(urls)) 97 | redis.call('SADD', key_filter_queue, unpack(urls)) 98 | return #urls 99 | end 100 | -------------------------------------------------------------------------------- /pyloom/lua/url_pop.lua: -------------------------------------------------------------------------------- 1 | -- 请求分配任务 2 | -- Keys: now 3 | -- Argv: name [name ...] 4 | -- Return: (url, name, address) 5 | local now = tonumber(KEYS[1]) 6 | for i = 1, #ARGV do 7 | local name = ARGV[i] -- 爬虫名 8 | local key_spider = "spider:" .. name 9 | local status = tonumber(redis.call("HGET", key_spider, "status")) 10 | redis.call("HSET", key_spider, "last_heartbeat_time", now) 11 | -- 条件: 爬虫状态至少为就绪态 12 | if status >= 10 then 13 | local interval = cjson.decode(redis.call("HGET", key_spider, "interval")) 14 | local last_pop_time = cjson.decode(redis.call("HGET", key_spider, "last_pop_time")) 15 | -- 条件: 爬虫已到可用时间 16 | if now >= last_pop_time + interval then 17 | local proxies = cjson.decode(redis.call("HGET", key_spider, "proxies")) 18 | local address = false 19 | -- 爬虫被设置了代理,把代理池弹空也要弹出一个可用代理 20 | if #proxies ~= 0 then 21 | local recycle = {} 22 | while not address do 23 | address = redis.call("RPOP", "proxy:addresses:" .. name) 24 | -- 代理池空了,不再继续弹出 25 | if not address then 26 | break 27 | end 28 | local t1 = string.find(address, ":", 1) 29 | local t2 = string.find(address, ":", t1 + 1) 30 | local valid_at = tonumber(string.sub(address, 1, t1 - 1)) 31 | local expire_at = tonumber(string.sub(address, t1 + 1, t2 - 1)) 32 | 33 | if valid_at > now then 34 | -- 代理未到可用时间,放回去 -> 重新弹出 35 | table.insert(recycle, address) 36 | address = false 37 | else 38 | -- 代理已到可用时间,并未过期 -> 已拿到代理! 39 | if expire_at > now then 40 | break 41 | end 42 | -- 代理已到可用时间,已过期 -> 重新弹出 43 | -- 过期代理不归还 44 | end 45 | end 46 | if #recycle ~= 0 then 47 | redis.call("LPUSH", "proxy:addresses:" .. name, unpack(recycle)) 48 | end 49 | end 50 | -- 条件: 爬虫未设置代理或代理池中可用代理不全为空 51 | if #proxies == 0 or address then 52 | local key_processing = "queue:" .. name .. ":processing" 53 | for priority = 0, 4 do 54 | -- 条件: 爬虫名下所有队列不全为空 55 | local key_waiting = "queue:" .. name .. ":waiting:" .. 
priority 56 | local url = redis.call("RPOP", key_waiting) 57 | -- 已满足所有条件,发布任务 58 | if url then 59 | -- 加入processing 60 | redis.call("HSET", key_processing, url, now) 61 | -- 更新last_pop_time 62 | redis.call("HSET", key_spider, "last_pop_time", now) 63 | return { url, name, address } 64 | end 65 | end 66 | -- 队列全部为空时,设置爬虫状态为已完成 67 | local processing_len = redis.call("HLEN", key_processing) 68 | if processing_len == 0 then 69 | redis.call("HSET", key_spider, "status", 0) 70 | end 71 | end 72 | end 73 | end 74 | end 75 | 76 | return { false, false, false } 77 | -------------------------------------------------------------------------------- /pyloom/proxy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import redis 3 | import traceback 4 | from .utils import * 5 | from .errors import * 6 | from . import drivers 7 | from .scheduler import Spider 8 | from threading import Thread, Lock 9 | 10 | logger = logging.getLogger("proxy") 11 | 12 | 13 | def proxy_handler(redis_conf, names, name, router, router_lock, driver, **params): 14 | """ 15 | 从驱动获取代理,推送至指定爬虫的代理池中 16 | 每只爬虫有一个代理池,键名为'proxy:proxies:',list型 17 | [address1, address2, address3...] 18 | 其中address为str型,结构为:'valid_at:expire_at:address' 19 | valid_at: 代理生效时间 20 | expire_at: 代理失效时间 21 | address: 代理地址 22 | address在valid_at < now < expire_at可用,并在now > expire_at时被删除 23 | Args: 24 | redis_conf: Redis配置 25 | names: 所有存活代理 26 | name: 当前代理名,通过name in names判断当前代理是否存活 27 | router: 路由表,获取代理后,根据路由推送至代理池 28 | router_lock: router的锁 29 | driver: 驱动类 30 | params: 驱动参数 31 | """ 32 | db = redis.StrictRedis.from_url(redis_conf) 33 | gen = driver(**params).gen_addresses() 34 | for is_ok, result in gen: 35 | with router_lock: 36 | targets = router.get(name, []) 37 | logger.debug("代理正在运行", name, targets) 38 | if not targets: 39 | logger.info("代理退出,router中没有记录", name, driver) 40 | break 41 | if name not in names: 42 | logger.info("代理退出,names中没有记录", name, driver) 43 | break 44 | if is_ok: 45 | for target in targets: 46 | length = db.lpush(f"proxy:addresses:{target}", *result) 47 | logger.info(f"添加代理, 代理:{name},目标:{target}, 新增数量:{len(result)}, 当前数量:{length}\n", result) 48 | else: 49 | time.sleep(1) 50 | else: 51 | logger.warning("代理出现异常", name, result) 52 | 53 | 54 | def get_driver(driver_name): 55 | """获取并检查驱动是否正确""" 56 | if not hasattr(drivers, driver_name): 57 | raise ProxyError("未找到驱动", driver_name) 58 | driver = getattr(drivers, driver_name) 59 | if not issubclass(driver, drivers.ProxyDriver): 60 | raise ProxyError("驱动应继承自ProxyDriver", driver_name) 61 | if not hasattr(driver, 'title'): 62 | raise ProxyError("驱动缺少属性", f"{driver_name}.title") 63 | return driver 64 | 65 | 66 | def start(redis_conf): 67 | """ 68 | 根据代理配置,维护代理线程池 69 | 代理配置为一个redis dict键,键名为proxy:configs,结构为: 70 | { 71 | proxy_name: { 72 | version: str, // 版本号,版本号变化时,代理线程将会重启 73 | driver: str, // 驱动名,对应proxy.py中的类 74 | **params // 驱动参数,将被传递给驱动 75 | } 76 | } 77 | """ 78 | logger.info("代理池已启动") 79 | db = redis.StrictRedis.from_url(redis_conf) 80 | threads = {} # 配置表,{proxy_name: {'version': int, 'thread': Thread}} 81 | router = {} # 路由表,{proxy_name: set([spider_name])} 82 | router_lock = Lock() 83 | for i in itertools.count(): 84 | try: 85 | time.sleep(3 if i else 0) 86 | # 更新路由表,指示代理线程拿到代理后要推给哪些爬虫 87 | _router = {} 88 | for spider_name in Spider.names(db): 89 | spider = Spider(db, spider_name) 90 | if spider.get_field("status") < 10: 91 | logger.debug("忽略未就绪爬虫", spider_name) 92 | continue 93 | last_heartbeat_time = 
spider.get_field("last_heartbeat_time") 94 | if time.time() - last_heartbeat_time > 300: 95 | logger.debug("忽略长久未运行的爬虫", spider_name) 96 | continue 97 | proxies = Spider(db, spider_name).get_field("proxies") 98 | if not proxies: 99 | logger.debug("忽略未配置代理的爬虫", spider_name) 100 | continue 101 | for proxy_name in proxies: 102 | _router.setdefault(proxy_name, set()).add(spider_name) 103 | with router_lock: 104 | router.clear() 105 | router.update(_router) 106 | 107 | # 标记失效线程 108 | configs = { 109 | k.decode(): json.loads(v) for k, v in db.hgetall("proxy:configs").items() 110 | } 111 | logger.debug("代理配置", configs) 112 | marked_threads = {} # 被标记退出的线程,结构同threads 113 | for proxy_name, fields in threads.items(): 114 | if proxy_name not in configs: 115 | logger.info("标记配置被删的代理", proxy_name) 116 | marked_threads[proxy_name] = fields 117 | continue 118 | if fields['version'] != configs[proxy_name]['version']: 119 | logger.info("标记配置更新的代理", proxy_name) 120 | marked_threads[proxy_name] = fields 121 | continue 122 | if proxy_name not in router: 123 | logger.info("标记已无爬虫的代理", proxy_name) 124 | marked_threads[proxy_name] = fields 125 | continue 126 | if not fields['thread'].is_alive(): 127 | logger.info("标记异常退出的代理", proxy_name) 128 | marked_threads[proxy_name] = fields 129 | continue 130 | 131 | # 销毁被标记的线程 132 | # 线程看见自己没在threads中时会终止 133 | if marked_threads: 134 | for proxy_name in marked_threads.keys(): 135 | del threads[proxy_name] 136 | with router_lock: 137 | if proxy_name in router: 138 | del router[proxy_name] 139 | logger.info("等待被标记代理线程退出", list(marked_threads.keys())) 140 | for _ in range(300): 141 | alive = any([t['thread'].is_alive() for t in marked_threads.values()]) 142 | if not alive: 143 | break 144 | time.sleep(1) 145 | else: 146 | logger.error("被标记代理线程超时仍未退出") 147 | threads.update(marked_threads) 148 | time.sleep(3) 149 | continue 150 | logger.info("被标记代理线程已全部退出") 151 | # 启动新线程 152 | proxy_names_new = set(configs.keys()) - set(threads.keys()) 153 | if proxy_names_new: 154 | for proxy_name in proxy_names_new: 155 | targets = router.get(proxy_name, []) 156 | if not targets: 157 | logger.debug("代理名下没有爬虫,暂不启动", proxy_name) 158 | continue 159 | logger.info("启动代理线程", proxy_name) 160 | version = configs[proxy_name].pop('version') 161 | driver = get_driver(configs[proxy_name].pop('driver')) 162 | t = Thread( 163 | target=proxy_handler, 164 | args=( 165 | redis_conf, threads, proxy_name, router, router_lock, driver 166 | ), 167 | kwargs=(configs[proxy_name]), 168 | daemon=True 169 | ) 170 | threads[proxy_name] = { 171 | 'version': version, 172 | 'thread': t 173 | } 174 | t.start() 175 | except KeyboardInterrupt: 176 | logger.info("收到Ctrl+C", 'proxy') 177 | break 178 | except Exception as e: 179 | logger.fatal("未处理的异常", type(e), e, '\n', traceback.format_exc()) 180 | -------------------------------------------------------------------------------- /pyloom/scheduler.py: -------------------------------------------------------------------------------- 1 | """Scheduler SDK""" 2 | import json 3 | import time 4 | import copy 5 | import random 6 | import logging 7 | from .errors import * 8 | from redis import StrictRedis 9 | from . 
import utils, tasks, buckets 10 | 11 | key_spiders = "spiders" # set 12 | logger = logging.getLogger("scheduler") 13 | 14 | 15 | class Spider(object): 16 | prefix = "spider" 17 | _caches = {} 18 | _timeout = 10 19 | status = { 20 | 0: '已完成', 21 | 10: '就绪', 22 | 20: '等待代理', # 暂未实现 23 | 21: '等待时间', # 暂未实现 24 | -1: '异常关闭', 25 | -2: '主动关闭' 26 | } 27 | 28 | def __init__(self, db: StrictRedis, name): 29 | self.name = name # 爬虫名 30 | self._db = db 31 | self.key = f"{self.prefix}:{self.name}" # 主键 32 | self.fields = { # 爬虫所有字段及可缓存时间 33 | "interval": 300, 34 | "timeout": 300, 35 | "precision": 10000, 36 | "args": 300, 37 | "last_pop_time": 1, 38 | "status": 1, 39 | "version": 1, 40 | "proxies": 1, 41 | "last_heartbeat_time": 6 42 | } 43 | 44 | def exists(self): 45 | """爬虫是否存在""" 46 | return self._db.exists(self.key) 47 | 48 | def upsert(self, seeders, interval, timeout, precision, args, proxies, version): 49 | """ 50 | 新建爬虫或覆盖同名爬虫的配置(仅当版本号更大时) 51 | Args: 52 | seeders: 种子页面 53 | interval: 最小调度间隔(误差由pop频率决定) 54 | timeout: 任务超时时间 55 | precision: 布隆过滤器精度 56 | args: 自定义爬虫参数 57 | proxies: 使用代理运行 58 | version: 配置版本,等于目录的sha1值 59 | Returns: 60 | T/F: 是否更新了配置 61 | """ 62 | # 当前版本比数据库的还小就不更新了 63 | _version = self._get_field("version") 64 | if _version is not None and version <= _version: 65 | return False 66 | # 忽略更新precision字段 67 | _precision = self._get_field("precision") 68 | if _precision is not None: 69 | precision = _precision 70 | # 爬虫配置 71 | values = { 72 | "interval": interval, 73 | "timeout": timeout, 74 | "precision": precision, 75 | "args": args, 76 | "last_pop_time": 0, 77 | "status": 10, # 0:已完成,10:就绪,20:等待代理,21:等待时间,-1:异常关闭,-2:主动关闭 78 | "version": version, 79 | "proxies": proxies, # 代理配置 80 | "last_heartbeat_time": 0, # 最后一次尝试申请任务的时间 81 | } 82 | self._db.hmset(self.key, {k: json.dumps(v) for k, v in values.items()}) 83 | self._db.sadd(key_spiders, self.name) 84 | # 将种子URL入队 85 | queues = Queue(self._db, self.name) 86 | queues.add(seeders, 0) 87 | return True 88 | 89 | def _get_field(self, field): 90 | """从数据库中查询并返回爬虫的配置项""" 91 | if field not in self.fields: 92 | raise SchedulerError(f"没有此配置项'{field}'") 93 | 94 | res = self._db.hget(self.key, field) 95 | if res is None: 96 | return None 97 | else: 98 | return json.loads(res) 99 | 100 | def get_field(self, field): 101 | """依此从缓存、数据库中查询并返回爬虫的配置项""" 102 | if field not in self.fields: 103 | raise SchedulerError(f"没有此配置项'{field}'") 104 | 105 | timeout = self.fields[field] 106 | # 先尝试从缓存取值 107 | cache_key = f"{self.name}:{field}" 108 | var, start = Spider._caches.get(cache_key, (None, 0)) 109 | if start + timeout < time.time(): 110 | # 缓存过期或无缓存 111 | var = self._get_field(field) 112 | if timeout > 0: 113 | Spider._caches[cache_key] = (var, time.time()) 114 | return var 115 | 116 | def set_field(self, field, value): 117 | """覆写爬虫的配置项""" 118 | if field not in self.fields: 119 | raise SchedulerError(f"没有此配置项'{field}'") 120 | if field == 'precision': 121 | raise SchedulerError(f"配置项被锁定'{field}'") 122 | 123 | self._db.hset(self.key, field, json.dumps(value)) 124 | # 设置缓存 125 | cache_key = f"{self.name}:{field}" 126 | timeout = self.fields[field] 127 | if timeout > 0: 128 | Spider._caches[cache_key] = (value, time.time()) 129 | 130 | @classmethod 131 | def names(cls, db: StrictRedis): 132 | """返回所有爬虫名称的列表""" 133 | return [r.decode() for r in db.smembers(key_spiders)] 134 | 135 | def clear_queue(self): 136 | """清除该爬虫在队列中留存的数据""" 137 | keys = [] 138 | keys += self._db.keys(f"{Spider.prefix}:{self.name}") 139 | keys += 
self._db.keys(f"{Queue.prefix}:{self.name}:*") 140 | keys += self._db.keys(f"{buckets.ShareBucket.prefix}:{self.name}:*") 141 | keys += self._db.keys(f"{tasks.Tracking.prefix}:{self.name}:*") 142 | return self._db.delete(*keys) if keys else 0 143 | 144 | def clear_proxy(self): 145 | """清空该爬虫的代理池""" 146 | count = self._db.delete(f"proxy:addresses:{self.name}") 147 | count += self._db.srem(key_spiders, self.name) 148 | return count 149 | 150 | 151 | class Queue(object): 152 | prefix = "queue" 153 | 154 | def __init__(self, db: StrictRedis, name): 155 | self.name = name # 爬虫名 156 | self._db = db 157 | self._spider = Spider(db, name) 158 | # 等待队列(list),5个优先级分别用5个list实现,左进右出 159 | # [[url0, url1], [url0, url1], [url0, url1]] 160 | self.key_waiting = [f"{self.prefix}:{self.name}:waiting:{i}" for i in range(5)] 161 | # 进行队列(hash),field=url, value=timestamp 162 | self.key_processing = f"{self.prefix}:{self.name}:processing" 163 | # 异常标签(set) 164 | self.key_tags = f"{self.prefix}:{self.name}:tags" 165 | # 异常队列(list) 166 | self.prefix_error = f"{self.prefix}:{self.name}:errors" # :{tag} 167 | # 队列过滤器(set),过滤waiting、processing、errors中的URl 168 | self.key_filter_queue = f"{self.prefix}:{self.name}:filter:queue" 169 | # 结果过滤器(string or set),过滤已抓取完成的URL 170 | # 结果过滤器有两种实现:set、bloom,通过爬虫配置项'queue.filter'选择适合的实现 171 | self.key_filter_bloom_count = f"{self.prefix}:{self.name}:filter:bloom:count" 172 | 173 | def exists(self, url): 174 | """ 175 | URL是否存在 176 | Returns: 177 | 0: 不存在 178 | 1: 存在于bloom中 179 | 2: 存在于queue中 180 | """ 181 | # 在results中找 182 | sha = utils.RedisScripts.sha1('bloom_check') 183 | if self._db.evalsha(sha, 1, self.name, url): 184 | return 1 185 | # 在queue中找 186 | if self._db.sismember(self.key_filter_queue, url): 187 | return 2 188 | else: 189 | return 0 190 | 191 | def insert(self, url, priority): 192 | """忽略布隆检查,将URL插入至队列中""" 193 | self._db.lpush(self.key_waiting[priority], url) 194 | self._db.sadd(self.key_filter_queue, url) 195 | self._db.hdel(self.key_processing, url) 196 | 197 | def add(self, urls, priority): 198 | """ 199 | URL批量入队 200 | 当URL相同,但priority不同时,也视为重复 201 | Returns: 经排重后添加至队列的数量 202 | """ 203 | if not isinstance(priority, int): 204 | raise SchedulerError("priority应为int型") 205 | if priority < 0 or priority >= len(self.key_waiting): 206 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}") 207 | 208 | urls = list(set(urls)) 209 | sha = utils.RedisScripts.sha1('url_add') 210 | return self._db.evalsha(sha, 2, self.name, priority, *urls) 211 | 212 | @classmethod 213 | def pop(cls, db: StrictRedis, names): 214 | """ 215 | 从指定爬虫中弹出一条最合适的URL 216 | Returns: (url, name) 217 | 当所有队列为空时,url == name == None 218 | """ 219 | # 随机挑选爬虫 220 | names = copy.deepcopy(names) 221 | random.shuffle(names) 222 | 223 | sha = utils.RedisScripts.sha1('url_pop') 224 | url, name, address = db.evalsha(sha, 1, time.time(), *names) 225 | if url and name: 226 | return [url.decode(), name.decode(), address.decode() if address else None] 227 | else: 228 | return [None, None, None] 229 | 230 | @classmethod 231 | def purge(cls, db: StrictRedis): 232 | """ 233 | 清理processing中过期的URL,返回被清理数量 234 | 被清理的URL,将被打上"timeout"标签,移入error队列 235 | """ 236 | count = 0 237 | for name in Spider.names(db): 238 | key = f"{cls.prefix}:{name}:processing" 239 | queue = cls(db, name) 240 | timeout = Spider(db, name).get_field("timeout") 241 | # redis的scan是可能重复返回同一元素的 242 | for url, _start in db.hscan_iter(key): 243 | if time.time() > float(_start) + timeout: # 过期 244 | count += 
queue.report_error("timeout", url) 245 | return count 246 | 247 | def report_finish(self, url): 248 | """标记URL为已完成状态""" 249 | if not self._db.hdel(self.key_processing, url): 250 | return False 251 | self._db.srem(self.key_filter_queue, url) 252 | sha = utils.RedisScripts.sha1('bloom_cas') 253 | logger.debug("report_finish", self.name, url) 254 | return self._db.evalsha(sha, 1, self.name, url) 255 | 256 | def report_error(self, tag, url): 257 | """标记URL为异常状态""" 258 | if not self._db.hdel(self.key_processing, url): 259 | return False 260 | self._db.sadd(self.key_tags, tag) 261 | return self._db.lpush(f"{self.prefix_error}:{tag}", url) 262 | 263 | @property 264 | def tags(self): 265 | """获取标签列表""" 266 | return { 267 | r.decode(): self._db.llen(f"{self.prefix_error}:{r.decode()}") 268 | for r in self._db.smembers(self.key_tags) 269 | } 270 | 271 | def get_errors(self, tag, count=0): 272 | """获取指定标签下的所有异常URL""" 273 | key = f"{self.prefix_error}:{tag}" 274 | return [r.decode() for r in self._db.lrange(key, 0, count - 1)] 275 | 276 | def remove_tag(self, tag): 277 | key = f"{self.prefix_error}:{tag}" 278 | self._db.srem(self.key_tags, tag) 279 | return self._db.delete(key) 280 | 281 | def rollback_tag(self, tag, priority): 282 | """ 283 | 将指定标签下的异常URL移至waiting队列中 284 | 返回回滚的URL数量 285 | """ 286 | if not isinstance(priority, int): 287 | raise SchedulerError("priority应为int型") 288 | if priority < 0 or priority >= len(self.key_waiting): 289 | raise SchedulerError(f"priority可选范围为:{list(range(len(self.key_waiting)))}") 290 | key_errors = f"{self.prefix_error}:{tag}" 291 | # 取出并删除异常URL、标签 292 | pipe = self._db.pipeline() 293 | pipe.lrange(key_errors, 0, -1) # 取出所有 294 | pipe.delete(key_errors) # 删除队列 295 | pipe.srem(self.key_tags, tag) # 删除标签 296 | res = pipe.execute() 297 | # 添加至waiting 298 | urls = res[0] 299 | if urls: 300 | self._db.lpush(self.key_waiting[priority], *urls) 301 | return len(urls) 302 | 303 | @property 304 | def details(self): 305 | """队列信息""" 306 | return { 307 | 'waiting': [self._db.llen(key) for key in self.key_waiting], 308 | 'processing': self._db.hlen(self.key_processing), 309 | 'results': int(self._db.get(self.key_filter_bloom_count) or 0), 310 | 'errors': sum([self._db.llen(f"{self.prefix_error}:{tag}") for tag in self.tags]) 311 | } 312 | -------------------------------------------------------------------------------- /pyloom/tasks.py: -------------------------------------------------------------------------------- 1 | import furl 2 | import json 3 | import redis 4 | import random 5 | import requests 6 | import traceback 7 | import simplejson.errors 8 | from .utils import * 9 | from .errors import * 10 | from lxml import etree 11 | from typing import List 12 | from typing import Union 13 | from . 
import scheduler, errors 14 | from bs4 import BeautifulSoup, element 15 | from .buckets import LocalBucket, ShareBucket 16 | 17 | logger = logging.getLogger("tasks") 18 | 19 | 20 | class Queue(object): 21 | """队列控制器""" 22 | 23 | def __init__(self, db, name, url): 24 | self._spider = scheduler.Spider(db, name) 25 | self._queue = scheduler.Queue(db, name) 26 | self.url = url 27 | 28 | @property 29 | def detail(self): 30 | return self._queue.details 31 | 32 | @property 33 | def timeout(self): 34 | return self._spider.get_field("timeout") 35 | 36 | @timeout.setter 37 | def timeout(self, value): 38 | if not isinstance(value, (int, float)): 39 | raise errors.TaskError("timeout应为int或float型") 40 | self._spider.set_field("timeout", value) 41 | 42 | @property 43 | def interval(self): 44 | return self._spider.get_field("interval") 45 | 46 | @interval.setter 47 | def interval(self, value): 48 | if not isinstance(value, (int, float)): 49 | raise errors.TaskError("interval应为int或float型") 50 | self._spider.set_field("interval", value) 51 | 52 | def freeze(self, seconds): 53 | """暂停调度seconds秒""" 54 | last_pop_time = time.time() + seconds - self.interval 55 | self._spider.set_field("last_pop_time", last_pop_time) 56 | logger.info("暂停调度", seconds) 57 | 58 | def stop(self): 59 | """停止调度,爬虫状态更改为'stop'""" 60 | logger.info("爬虫状态更改为'stop'") 61 | self._spider.set_field("status", -2) 62 | 63 | def finish(self): 64 | """ 65 | 提前完成调度,爬虫状态更改为'finish' 66 | 默认情况下,当所有队列均为空时,爬虫状态自动变为'finish' 67 | """ 68 | logger.info("爬虫状态更改为'finish'") 69 | self._spider.set_field("status", 0) 70 | 71 | 72 | class UserAgent(object): 73 | _ua = None 74 | _browsers = None 75 | 76 | def __getitem__(self, item): 77 | if UserAgent._ua is None: 78 | filename = os.path.join(os.path.dirname(__file__), "user-agent.json") 79 | with open(filename, encoding='utf8') as f: 80 | UserAgent._ua = json.load(f) 81 | UserAgent._browsers = list(UserAgent._ua.keys()) 82 | if item == 'random': 83 | item = random.choice(UserAgent._browsers) 84 | return random.choice(UserAgent._ua[item]) 85 | 86 | # 便于IDE提示 87 | @property 88 | def chrome(self): 89 | return self["chrome"] 90 | 91 | @property 92 | def ie(self): 93 | return self["ie"] 94 | 95 | @property 96 | def safari(self): 97 | return self["safari"] 98 | 99 | @property 100 | def firefox(self): 101 | return self["firefox"] 102 | 103 | @property 104 | def android(self): 105 | return self["android"] 106 | 107 | @property 108 | def random(self): 109 | return self["random"] 110 | 111 | 112 | class CSS(object): 113 | def __init__(self, root, pattern=":root"): 114 | if isinstance(root, (element.Tag, type(None))): 115 | self._root = root 116 | elif isinstance(root, str): 117 | self._root = BeautifulSoup(root, "lxml") 118 | else: 119 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造CSS") 120 | 121 | self._pattern = pattern 122 | self._default = ArgDefault 123 | 124 | def __bool__(self): 125 | return self._root is not None 126 | 127 | def __repr__(self): 128 | return f"CSS('{self._pattern}')" 129 | 130 | def one(self, pattern): 131 | node = self._root.select_one(pattern) 132 | return CSS(node, pattern) # type: CSS 133 | 134 | def many(self, pattern) -> List['CSS']: 135 | nodes = self._root.select(pattern) 136 | return [CSS(node, pattern) for node in nodes] 137 | 138 | def exist(self, pattern): 139 | return bool(self.one(pattern)) 140 | 141 | def default(self, value): 142 | self._default = value 143 | return self 144 | 145 | def text(self, regex=None, strip=True, separator=""): 146 | if self._root is None: 147 | if 
self._default is ArgDefault: 148 | raise errors.TaskError(f"未找到:{repr(self)}") 149 | else: 150 | # 默认值不校验格式,直接返回 151 | return self._default 152 | _text = self._root.get_text(separator, strip) 153 | if regex is None or re.match(regex, _text): 154 | return _text 155 | else: 156 | raise errors.TaskError(f"未通过正则校验:{regex}") 157 | 158 | def html(self): 159 | if self._root is None: 160 | if self._default is ArgDefault: 161 | raise errors.TaskError(f"未找到:{repr(self)}") 162 | else: 163 | # 默认值不校验格式,直接返回 164 | return self._default 165 | return str(self._root) 166 | 167 | @property 168 | def attrs(self): 169 | return self._root.attrs 170 | 171 | 172 | class XPath(object): 173 | def __init__(self, root, pattern="/*"): 174 | if isinstance(root, (etree._Element, type(None))): 175 | self._root = root 176 | elif isinstance(root, str): 177 | self._root = etree.HTML(root) 178 | else: 179 | raise errors.TaskError(f"不支持从'{type(root)}'类型构造XPath") 180 | 181 | self._pattern = pattern 182 | self._default = ArgDefault 183 | 184 | def __bool__(self): 185 | return self._root is not None 186 | 187 | def __repr__(self): 188 | return f"XPath('{self._pattern}')" 189 | 190 | def one(self, pattern): 191 | nodes = self._root.xpath(pattern) 192 | if nodes: 193 | return XPath(nodes[0]) 194 | else: 195 | return XPath(None) 196 | 197 | def many(self, pattern): 198 | nodes = self._root.xpath(pattern) 199 | return [XPath(node, pattern) for node in nodes] 200 | 201 | def exist(self, pattern): 202 | return bool(self.one(pattern)) 203 | 204 | def default(self, value): 205 | self._default = value 206 | return self 207 | 208 | def text(self, regex=None, strip=True): 209 | if self._root is None: 210 | if self._default is ArgDefault: 211 | raise errors.TaskError(f"未找到{repr(self)}") 212 | else: 213 | # 默认值不校验格式,直接返回 214 | return self._default 215 | _text = self._root.text 216 | _text = '' if _text is None else _text 217 | _text = _text.strip() if strip else _text 218 | if regex is None or re.match(regex, _text): 219 | return _text 220 | else: 221 | raise errors.TaskError(f"未通过正则校验:{regex}") 222 | 223 | @property 224 | def attrs(self): 225 | return self._root.attrib 226 | 227 | 228 | class Regex(object): 229 | def __init__(self, root): 230 | self._root = root 231 | 232 | def __bool__(self): 233 | return self._root is not None 234 | 235 | def many(self, pattern): 236 | return re.findall(pattern, self._root) 237 | 238 | 239 | class Response(object): 240 | def __init__(self, resp: requests.Response): 241 | self._resp = resp 242 | self.encoding = "utf-8" 243 | # 解析器 244 | self._css = None # type: CSS 245 | self._xpath = None # type: XPath 246 | self._json = None # type: dict 247 | self._re = None # type: Regex 248 | 249 | self.content = resp.content 250 | self.status_code = resp.status_code 251 | self.url = resp.url 252 | self.furl = furl.furl(resp.url) 253 | self.request = resp.request # type: requests.PreparedRequest 254 | self.history = resp.history # type: list 255 | self.cookies = resp.cookies # type: dict 256 | self.headers = resp.headers # type: dict 257 | 258 | @property 259 | def re(self) -> Regex: 260 | if not self._re: 261 | self._re = Regex(self.text) 262 | return self._re 263 | 264 | @property 265 | def text(self) -> str: 266 | self._resp.encoding = self.encoding 267 | return self._resp.text 268 | 269 | @property 270 | def json(self) -> dict: 271 | if self._json: 272 | return self._json 273 | try: 274 | self._json = self._resp.json() 275 | return self._json 276 | except simplejson.errors.JSONDecodeError: 277 | raise 
errors.JSONDecodeError 278 | 279 | @property 280 | def css(self) -> CSS: 281 | if self._css is None: 282 | self._css = CSS(self.content.decode(self.encoding)) 283 | return self._css 284 | 285 | @property 286 | def xpath(self) -> XPath: 287 | if self._xpath is None: 288 | self._xpath = XPath(self.content.decode(self.encoding)) 289 | return self._xpath 290 | 291 | def __repr__(self): 292 | return f"Response({self.status_code})" 293 | 294 | 295 | class Tracking(object): 296 | """数据埋点""" 297 | prefix = 'tracking' 298 | 299 | def __init__(self, name, db): 300 | self._name = name 301 | self._db = db 302 | 303 | def incr(self, field, amount=1): 304 | return self._db.incr(f"{self.prefix}:{self._name}:{field}", amount) 305 | 306 | def get(self, field): 307 | r = self._db.get(f"{self.prefix}:{self._name}:{field}") 308 | return int(r) if r else None 309 | 310 | @property 311 | def fields(self): 312 | return [i.decode().split(":", 2)[2] for i in self._db.keys(f"{self.prefix}:{self._name}:*")] 313 | 314 | 315 | class Client(object): 316 | """封装requests,便于包装响应包、掌管代理""" 317 | 318 | def __init__(self, name, db, address=None): 319 | self._address = address 320 | self._set_address(address) 321 | self.name = name 322 | self.headers = {} 323 | self._db = db # type: redis.StrictRedis 324 | self._session = requests 325 | self._reuse = False 326 | 327 | def session(self): 328 | """返回跨请求保留Cookie的客户端""" 329 | client = Client(self.name, self._db, self._address) 330 | client._session = requests.session() 331 | return client 332 | 333 | def request(self, method, url, **kwargs): 334 | try: 335 | headers = {**self.headers, **kwargs.pop("headers", {})} 336 | proxies = {**self.proxies, **kwargs.pop("proxies", {})} 337 | resp = self._session.request( 338 | method, url, 339 | headers=headers, 340 | proxies=proxies, 341 | **kwargs 342 | ) 343 | except requests.exceptions.Timeout as e: 344 | raise errors.Timeout(e) 345 | except requests.exceptions.ProxyError as e: 346 | raise errors.ProxyError(e) 347 | except requests.exceptions.RequestException as e: 348 | raise errors.RequestError(e) 349 | except Exception as e: 350 | raise e 351 | return Response(resp) 352 | 353 | def get(self, url, params=None, **kwargs): 354 | return self.request("get", url, params=params, **kwargs) 355 | 356 | def post(self, url, data=None, json=None, **kwargs): 357 | return self.request("post", url, data=data, json=json, **kwargs) 358 | 359 | def head(self, url, **kwargs): 360 | return self.request("head", url, **kwargs) 361 | 362 | def options(self, url, **kwargs): 363 | return self.request("options", url, **kwargs) 364 | 365 | def patch(self, url, data=None, **kwargs): 366 | return self.request("patch", url, data=data, **kwargs) 367 | 368 | def put(self, url, data=None, **kwargs): 369 | return self.request("put", url, data=data, **kwargs) 370 | 371 | def delete(self, url, **kwargs): 372 | return self.request("delete", url, **kwargs) 373 | 374 | def _set_address(self, address): 375 | if address: 376 | proxy = address.split(":", 2)[2] 377 | self.proxies = { 378 | "http": proxy, 379 | "https": proxy 380 | } 381 | self.proxy = proxy 382 | self.address = address 383 | logger.debug("设置代理", proxy) 384 | else: 385 | self.proxies = {} 386 | self.proxy = None 387 | self.address = None 388 | 389 | def reload_proxy(self) -> bool: 390 | """丢弃当前代理并更换新代理,若代理池已无可用代理,返回False""" 391 | recycle = [] 392 | try: 393 | while True: 394 | address = self._db.rpop(f"proxy:addresses:{self.name}") 395 | # 代理池空了 396 | if not address: 397 | raise TaskBreak(0) 398 | address = 
address.decode() # type: str 399 | _valid_at, _expire_at, _ = address.split(":", 2) 400 | valid_at, expire_at = float(_valid_at), float(_expire_at) 401 | # 未到可用时间,还回去 402 | if valid_at > time.time(): 403 | recycle.append(address) 404 | continue 405 | # 已到可用时间,但过期了,直接丢弃 406 | if expire_at < time.time(): 407 | continue 408 | self._set_address(address) 409 | return True 410 | finally: 411 | if recycle: 412 | self._db.lpush(f"proxy:addresses:{self.name}", *recycle) 413 | 414 | def reuse_proxy(self, freeze=0): 415 | """回收代理,并在freeze秒后可再次被分配""" 416 | # 只可reuse一次 417 | if self._reuse: 418 | return 419 | else: 420 | self._reuse = True 421 | if self.address: 422 | _, expire_at, proxy = self.address.split(":", 2) 423 | valid_at = time.time() + freeze 424 | self._db.lpush(f"proxy:addresses:{self.name}", f"{valid_at}:{expire_at}:{proxy}") 425 | logger.debug("回收代理", f"{valid_at}:{expire_at}:{proxy}") 426 | self._set_address(None) 427 | 428 | def __setattr__(self, key, value): 429 | if key in ['params', 'history']: 430 | setattr(self._session, key, value) 431 | else: 432 | super(Client, self).__setattr__(key, value) 433 | 434 | 435 | class Buckets(object): 436 | """数据存储""" 437 | 438 | def __init__(self, local, share): 439 | self.local = local # type: LocalBucket 440 | self.share = share # type: ShareBucket 441 | 442 | 443 | class Task(object): 444 | """描述爬虫行为的抽象类""" 445 | filters = [] 446 | 447 | def __init__(self, name, url, db, address): 448 | """ 449 | Args: 450 | name: 爬虫名 451 | url: 当前URL 452 | db: redis数据库(用户不可使用) 453 | address: 代理地址 454 | """ 455 | self._spider = scheduler.Spider(db, name) 456 | self._queue = scheduler.Queue(db, name) 457 | self._db = db # type: redis.StrictRedis 458 | 459 | self.url = url # type: str 460 | self.furl = furl.furl(url) 461 | self.name = name # type: str 462 | self.logger = logging.getLogger(name) 463 | self.client = Client(name, db, address) 464 | self.queue = Queue(db, name, url) 465 | self.ua = UserAgent() 466 | self.buckets = Buckets(LocalBucket.instance(name), ShareBucket(db, name)) 467 | self.args = self._spider.get_field("args") 468 | self.lock = self._db.lock # 分布式锁 469 | self.tracking = Tracking(name, db) 470 | self.result = None 471 | self.response = None # type: Response 472 | 473 | def on_download(self) -> Response: 474 | """下载并返回响应包""" 475 | raise NotImplementedError() 476 | 477 | def on_parse(self) -> dict: 478 | """提取并返回目标数据""" 479 | return {} 480 | 481 | def on_link(self) -> Union[list, dict]: 482 | """ 483 | 提取并返回新链接 484 | Returns: 485 | links: links可以是list和dict两种类型 486 | dict: 指定不同的优先级: {priority: urls} 487 | list: 将links中的url添加到优先级为1的队列中 488 | 相当于: {1: urls} 489 | """ 490 | 491 | def on_save(self): 492 | """存储数据""" 493 | self.logger.debug("on_save", self.result) 494 | 495 | def on_finish(self): 496 | """已完成""" 497 | 498 | def on_error(self, e) -> bool: 499 | """ 500 | 处理生命周期中抛出的异常(包括on_finish) 501 | Returns: 502 | True: 异常已被处理 503 | False: 异常无法处理 504 | """ 505 | return False 506 | 507 | 508 | def execute(task: Task): 509 | """ 510 | 运行task实例并处理所有异常 511 | Returns: 512 | links: {priority: urls} 513 | """ 514 | try: 515 | task.tracking.incr('on_download') 516 | task.response = task.on_download() 517 | task.tracking.incr('on_download_ok') 518 | task.result = task.on_parse() 519 | links = task.on_link() 520 | if isinstance(links, list): 521 | links = {3: links} 522 | elif links is None: 523 | links = {} 524 | elif not isinstance(links, dict): 525 | raise errors.TaskError(f"on_link返回值应是list或dict型,而非{type(links)}") 526 | task.on_save() 527 | 
task.on_finish() 528 | return links 529 | except errors.TaskFinish: 530 | logger.debug("TaskFinish", task.url) 531 | task.on_finish() 532 | return {} 533 | except errors.TaskBreak as e: 534 | logger.debug("TaskBack", e.priority, task.url) 535 | task._queue.insert(task.url, e.priority) 536 | return {} 537 | except errors.TaskError as e: 538 | task._queue.report_error(e.__class__.__name__, task.url) 539 | logger.warning("Task报告的异常", str(e), task.url) 540 | return {} 541 | except Exception as e: 542 | if task.on_error(e): 543 | return {} 544 | task._queue.report_error("unknown", task.url) 545 | logger.error(f"Task未处理的异常", "unknown", task.url) 546 | traceback.print_exc() 547 | return {} 548 | -------------------------------------------------------------------------------- /pyloom/user-agent.json: -------------------------------------------------------------------------------- 1 | { 2 | "chrome": [ 3 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", 4 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", 5 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 6 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 7 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 9 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 11 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36", 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36", 13 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 14 | "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 15 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 16 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 17 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", 18 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36", 19 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36", 20 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36", 21 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36", 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36", 23 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36", 24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F", 25 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 
Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 26 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", 27 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 29 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 30 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36", 31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36", 32 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36", 33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36", 34 | "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36", 35 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36", 36 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36", 37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36", 38 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36", 39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36", 40 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 41 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 42 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 43 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 44 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 46 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36", 47 | "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 48 | "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 49 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17", 50 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17", 51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15", 52 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14" 53 | ], 54 | "firefox": [ 55 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 56 | "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0", 57 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) 
Gecko/20100101 Firefox/33.0", 58 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0", 59 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0", 60 | "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0", 61 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0", 62 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0", 63 | "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0", 64 | "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0", 65 | "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3", 66 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0", 67 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0", 68 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0", 69 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0", 70 | "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0", 72 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0", 73 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0", 74 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0", 75 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0", 76 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0", 77 | "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0", 78 | "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0", 79 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 80 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 81 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0", 82 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 83 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0", 84 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0", 85 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0", 86 | "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0", 87 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0", 88 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0", 89 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0", 90 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", 91 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0", 92 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0", 93 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0", 94 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0", 95 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0", 96 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0", 97 | "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0", 98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0", 99 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0", 100 | "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0", 101 | "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 
Firefox/19.0", 102 | "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1", 103 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", 104 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6" 105 | ], 106 | "safari": [ 107 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A", 108 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25", 109 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 110 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10", 111 | "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3", 112 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 113 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 114 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 115 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 116 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 117 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 118 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 119 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 120 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 121 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 122 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 123 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 124 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 125 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 126 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 127 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 128 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 129 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 130 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 131 | "Mozilla/5.0 (Macintosh; U; Intel 
Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 132 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 133 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 134 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 135 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 136 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 137 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 138 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 139 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 140 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 141 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 142 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 143 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 144 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 145 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 146 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 147 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 148 | "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 149 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 150 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 151 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 152 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 153 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 154 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 155 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 156 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5" 157 | ], 158 | "ie": [ 159 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko", 160 | "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; 
rv:11.0) like Gecko", 161 | "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0", 162 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)", 163 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", 164 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)", 165 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)", 166 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)", 167 | "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)", 168 | "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)", 169 | "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)", 170 | "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)", 171 | "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))", 172 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 173 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)", 174 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)", 175 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7", 176 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)", 177 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)", 178 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 179 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 180 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)", 181 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0", 182 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)", 183 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)", 184 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)", 185 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)", 186 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)", 187 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205", 188 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)", 189 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)", 190 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)", 191 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; 
InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)", 192 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 193 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)", 194 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)", 195 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)", 196 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)", 197 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)", 198 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 199 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)", 200 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)", 201 | "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)", 202 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)", 203 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)", 204 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8", 205 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)", 206 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)", 207 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)", 208 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)" 209 | ], 210 | "android": [ 211 | "android Mozilla/5.0 (Linux; Android 8.0.0; ATU-AL10 Build/HUAWEIATU-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/hw", 212 | "android Mozilla/5.0 (Linux; Android 8.1.0; ONEPLUS A5000 Build/OPM1.171019.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36 phh_android_version/4.17.1 phh_android_build/1a7ec8b149 phh_android_channel/oppo" 213 | ] 214 | } 215 | -------------------------------------------------------------------------------- /pyloom/utils.py: -------------------------------------------------------------------------------- 1 | """小工具""" 2 | import re 3 | import os 4 | import sys 5 | import time 6 | import uuid 7 | import types 8 | import logging 9 | import readline 10 | import functools 11 | import itertools 12 | import importlib 13 | from 
pyloom.errors import * 14 | from importlib.machinery import SourceFileLoader 15 | from logging.handlers import TimedRotatingFileHandler 16 | 17 | logger = logging.getLogger("utils") 18 | 19 | 20 | class ArgDefault(object): 21 | """默认参数""" 22 | 23 | def __bool__(self): 24 | return False 25 | 26 | 27 | def patch_logger_format(): 28 | """使logger支持用多个参数构造日志内容""" 29 | log_bak = logging.Logger._log 30 | 31 | def log(self, level, msg, *args): 32 | gap = ' ' 33 | out = str(msg) + gap 34 | for value in args[0]: 35 | out = out + str(value) + gap 36 | log_bak(self, level, out, []) 37 | 38 | logging.Logger._log = log 39 | 40 | 41 | def patch_handler_color(handler): 42 | """使handler支持根据日志级别输出彩色日志""" 43 | emit_bak = handler.emit 44 | 45 | def emit(*args): 46 | level = args[0].levelno 47 | if level >= 50: 48 | color = '\x1b[31m' # red, critical 49 | elif level >= 40: 50 | color = '\x1b[31m' # red, error 51 | elif level >= 30: 52 | color = '\x1b[33m' # yellow, warning 53 | elif level >= 20: 54 | color = '\x1b[32m' # green, info 55 | elif level >= 10: 56 | color = '\x1b[35m' # pink, debug 57 | else: 58 | color = '\x1b[0m' # normal 59 | args[0].msg = color + args[0].msg + '\x1b[0m' 60 | return emit_bak(*args) 61 | 62 | handler.emit = emit 63 | 64 | 65 | class RedisScripts(object): 66 | """管理redis-lua脚本""" 67 | _scripts = {} 68 | 69 | @classmethod 70 | def load(cls, db): 71 | path = os.path.join(os.path.dirname(__file__), 'lua') 72 | for filename in os.listdir(path): 73 | lua_file = os.path.join(path, filename) 74 | with open(lua_file, encoding="utf-8") as f: 75 | sha1 = db.script_load(f.read()) 76 | command = filename.split('.')[0] 77 | RedisScripts._scripts[command] = sha1 78 | logger.info("缓存Lua脚本", command, sha1) 79 | 80 | @classmethod 81 | def sha1(cls, command): 82 | return RedisScripts._scripts[command] 83 | 84 | 85 | def dict_merge(base: dict, delta: dict, check_not_none=True) -> dict: 86 | """ 87 | 将delta递归合并至base,覆盖同名字段 88 | 若is_not_none为True, 89 | 合并后不应有值为ConfigNotNone,否则抛出ConfigNotNone异常 90 | Example: 91 | # 递归合并,修改实参 92 | >>> base = {'redis': {'host': '127.0.0.1', 'port': 6379}} 93 | >>> delta = {'redis': {'host': '192.168.1.1'}} 94 | >>> dict_merge(base, delta) 95 | {'redis': {'host': '192.168.1.1', 'port': 6379}} 96 | >>> base 97 | {'redis': {'host': '192.168.1.1', 'port': 6379}} 98 | 99 | # 参数check_not_none 100 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}} 101 | >>> delta = {'redis': {'host': '192.168.1.1'}} 102 | >>> dict_merge(base, delta, True) 103 | Traceback (most recent call last): 104 | ... 
105 | pyloom.errors.ConfigNotNone: 缺少配置项:'port' 106 | >>> base = {'redis': {'host': '127.0.0.1', 'port': ConfigNotNone}} 107 | >>> dict_merge(base, delta, False) 108 | {'redis': {'host': '192.168.1.1', 'port': }} 109 | """ 110 | if not isinstance(base, dict): 111 | return delta 112 | common_keys = set(base).intersection(delta) 113 | new_keys = set(delta).difference(common_keys) 114 | for key in common_keys: 115 | base[key] = dict_merge(base[key], delta[key], check_not_none) 116 | for key in new_keys: 117 | base[key] = delta[key] 118 | if check_not_none: 119 | for key in base: 120 | if base[key] is ConfigNotNone: 121 | raise ConfigNotNone(key) 122 | return base 123 | 124 | 125 | def retry(tries=-1, delay=1, max_delay=None, backoff=0, catches=None, error=None): 126 | """ 127 | 自动重试 128 | 129 | 当delay=1,backoff=0时,依此休眠: 130 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 131 | 当delay=1,backoff=1时,依此休眠: 132 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 133 | 当delay=1,backoff=2时,依此休眠: 134 | [1, 2, 5, 10, 17, 26, 37, 50, 65, 82] 135 | 136 | Args: 137 | tries: 重试次数,(-1:不限重试次数) 138 | delay: 初始重试秒数 139 | max_delay: 最大重试秒数(None:不限) 140 | backoff: 退避指数 141 | catches: 可被捕捉的异常(RetryError始终可用) 142 | error: 达到最大重试次数时抛出的异常(默认RetryExceeded) 143 | """ 144 | if catches is None: 145 | catches = [] 146 | 147 | def decorator(func): 148 | @functools.wraps(func) 149 | def wrapper(*args, **kwargs): 150 | # 记数生成器 151 | if tries >= 0: 152 | count = range(tries) 153 | else: 154 | count = itertools.count() 155 | # 处理重试 156 | for i in count: 157 | setattr(wrapper, "count", i) 158 | try: 159 | return func(*args, **kwargs) 160 | except (RetryError, *catches): 161 | if backoff == 0: 162 | sleep = delay 163 | else: 164 | sleep = delay + i ** backoff 165 | if max_delay: 166 | sleep = min(sleep, max_delay) 167 | time.sleep(sleep) 168 | # 重试次数超限 169 | else: 170 | if error is None: 171 | raise RetryExceeded 172 | else: 173 | raise error 174 | 175 | return wrapper 176 | 177 | return decorator 178 | 179 | 180 | def template_input(template): 181 | """ 182 | 在命令行提示用户输入配置 183 | Args: 184 | template: 配置模板 185 | 例如完成员工信息填写: 186 | ArgDefault表示必填参数,无默认值 187 | [ 188 | { 189 | "name": 配置名, 190 | "title": 配置标题, 191 | "example": 示例, 192 | "default": 默认值(留空表示必填参数), 193 | "note": 提示信息, 194 | "type": 类型转换函数 195 | } 196 | ] 197 | """ 198 | configs = {} 199 | for fields in template: 200 | name = fields['name'] 201 | default = fields.get('default', ArgDefault) 202 | example = fields.get('example', ArgDefault) 203 | note = fields.get('note', ArgDefault) 204 | title = fields.get('title', name) 205 | regex = fields.get('regex', ArgDefault) 206 | _type = fields.get('type', ArgDefault) 207 | _range = fields.get('range', ArgDefault) 208 | if _type is not ArgDefault: 209 | output = f"{title}[{_type.__name__}]\n" 210 | else: 211 | output = f"{title}\n" 212 | if example is not ArgDefault: 213 | output += f"示例: {example}\n" 214 | if note is not ArgDefault: 215 | output += f"提示: {note}\n" 216 | output += '➜ ' 217 | first = True 218 | while True: 219 | if first: 220 | var = input(output) 221 | first = False 222 | else: 223 | var = input('➜ ') 224 | if var: 225 | # 类型检查 226 | if _type is not ArgDefault: 227 | try: 228 | var = _type(var) 229 | except ValueError: 230 | print(f"参数类型有误,请重试") 231 | continue 232 | # 范围检查 233 | if _range is not ArgDefault and var not in _range: 234 | print(f"参数范围有误,请重试") 235 | continue 236 | # 正则检查 237 | if regex is not ArgDefault and not re.match(regex, var): 238 | print(f"参数格式有误,请重试") 239 | continue 240 | break 241 | elif not var and default is not 
ArgDefault: 242 | var = default 243 | break 244 | else: 245 | print(f"参数不可留空,请重试") 246 | configs[name] = var 247 | return configs 248 | 249 | 250 | def load_py_configs(file) -> dict: 251 | """ 252 | 加载PY格式的配置文件,当配置文件为空时,返回{} 253 | Args: 254 | file: 配置文件路径 255 | """ 256 | if not os.path.exists(file): 257 | raise ConfigFileNotFoundError(file) 258 | m = SourceFileLoader(uuid.uuid4().hex, file).load_module() 259 | return {k: v for k, v in vars(m).items() if not k.startswith('__')} 260 | 261 | 262 | def load_spider_configs(path) -> dict: 263 | """ 264 | 加载爬虫配置 265 | Args: 266 | path: 爬虫目录 267 | """ 268 | _configs = { 269 | "seeders": ConfigNotNone, 270 | "interval": 3, 271 | "timeout": 120, 272 | "precision": 0.0001, 273 | "args": {} 274 | } 275 | conf = os.path.join(path, 'configs.py') 276 | if not os.path.exists(conf): 277 | raise ConfigFileNotFoundError(f"ERR: 未找到爬虫配置:'{conf}'") 278 | return dict_merge(_configs, load_py_configs(conf)) 279 | 280 | 281 | def tail(file): 282 | """模仿linux中的tail命令""" 283 | try: 284 | with open(file, 'rb') as f: 285 | for i in range(1, 11): 286 | try: 287 | f.seek(-(i ** 3), 2) 288 | except OSError: 289 | f.seek(-((i - 1) ** 3), 2) 290 | break 291 | while True: 292 | line = f.readline() 293 | if not line: 294 | time.sleep(0.1) 295 | continue 296 | try: 297 | yield line.decode('utf8') 298 | except UnicodeDecodeError: 299 | time.sleep(0.1) 300 | continue 301 | except KeyboardInterrupt: 302 | pass 303 | -------------------------------------------------------------------------------- /pyloom/worker.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import signal 3 | import traceback 4 | import threading 5 | import multiprocessing 6 | from . import buckets 7 | from .utils import * 8 | from .tasks import Task, execute 9 | from .scheduler import Spider, Queue 10 | 11 | logger = logging.getLogger("worker") 12 | 13 | 14 | def worker_process(redis_conf, spiders, threads, token_curr, token_new): 15 | """ 16 | Worker子进程,负责启动线程 17 | Args: 18 | redis_conf: redis数据库 19 | spiders: 所有爬虫配置表,{name: (path, version)} 20 | threads: 线程数 21 | token_curr: 新建进程时的token 22 | token_new: 父进程中最新的token 23 | 当token_curr与token_new不同时,表示父进程已更新了路由, 24 | 线程在完成当前生命周期后需自行退出 25 | """ 26 | logger.debug("Worker进程已启动") 27 | # Manager的共享变量在并发启动过多进程时会出现ConnectionRefusedError 28 | for _ in range(60): 29 | try: 30 | spiders.items() 31 | break 32 | except Exception: 33 | pass 34 | else: 35 | logger.fatal("Worker进程退出,spiders超时未就绪") 36 | return 37 | 38 | thread_ids = [] 39 | # 构造路由,{name: [[regex, task]...]} 40 | routers = {} 41 | for name, (path, version) in spiders.items(): 42 | tasks = import_tasks(path) 43 | if tasks: 44 | routers[name] = tasks 45 | logger.info("载入爬虫成功", name, version) 46 | else: 47 | logger.info("载入爬虫失败,未发现合规Task类", name, version) 48 | # 启动线程 49 | try: 50 | logger.info("正在启动Worker线程") 51 | signal.signal(signal.SIGINT, signal.SIG_IGN) # 忽略Ctrl+C 52 | for thread_index in range(threads): 53 | thread = threading.Thread( 54 | target=worker_thread, 55 | args=(redis_conf, routers, token_curr, token_new, thread_index) 56 | ) 57 | thread.start() 58 | thread_ids.append(thread) 59 | logger.info("Worker线程启动成功") 60 | except Exception as e: 61 | logger.fatal("Worker进程结束,启动Worker线程时出现异常", e, '\n', traceback.format_exc()) 62 | return 63 | 64 | for i in itertools.count(): 65 | try: 66 | # 清理进程内的过期键 67 | if i % 500 == 0: 68 | count = buckets.LocalBucket.purge() 69 | if count: 70 | logger.debug(f"完成清理LocalBucket", count) 71 | # 线程全部退出后,结束进程 72 | if not 
any([t.is_alive() for t in thread_ids]): 73 | logger.info("Worker进程结束,线程已全部退出") 74 | return 75 | time.sleep(2) 76 | except Exception as e: 77 | logger.fatal("Worker进程异常", e, '\n', traceback.format_exc()) 78 | time.sleep(5) 79 | 80 | 81 | def worker_thread(redis_conf, routers, token_curr, token_new, thread_index): 82 | """ 83 | 循环:申请任务->执行任务->上报结果 84 | 线程内捕捉所有异常,永不退出(Ctrl+C除外) 85 | """ 86 | logger.debug("Worker线程已启动") 87 | db = redis.StrictRedis.from_url(redis_conf) 88 | pop_failure_count = 0 89 | while True: 90 | try: 91 | # 结束线程 92 | try: 93 | if token_curr != token_new.value: 94 | logger.info("Worker线程结束,收到退出信号") 95 | return 96 | except ConnectionRefusedError: 97 | logger.debug("Token未就绪") 98 | time.sleep(1) 99 | continue 100 | except (BrokenPipeError, FileNotFoundError, EOFError): 101 | logger.info("Worker线程结束,Token已关闭") 102 | return 103 | # 从队列中弹出URL 104 | if not routers: 105 | logger.info("本地爬虫列表为空,等待加载爬虫") 106 | while not routers: 107 | time.sleep(1) 108 | keys = list(routers.keys()) 109 | url, name, address = Queue.pop(db, keys) 110 | if not url: 111 | if pop_failure_count % 20 == 0: # 避免日志过多 112 | logger.debug("暂无已就绪任务,稍后重试") 113 | time.sleep(thread_index / 10 + 0.1) 114 | pop_failure_count += 1 115 | continue 116 | logger.info("获得任务", name, url, address) 117 | pop_failure_count = 0 118 | # 匹配Task类并执行 119 | tasks = routers.get(name, None) 120 | queue = Queue(db, name) 121 | if tasks is None: 122 | logger.warning("爬虫匹配失败", name, url) 123 | queue.report_error("none_spider", url) 124 | continue 125 | for regex, task_cls in tasks: 126 | if not regex.match(url): 127 | continue 128 | # 实例化Task并执行 129 | task = task_cls(name, url, db, address) 130 | links = execute(task) 131 | for priority, urls in links.items(): 132 | count = queue.add(urls, priority) 133 | logger.debug("添加任务", priority, f"{count}/{len(urls)}") 134 | logger.debug("报告任务完成", queue.report_finish(url), url) 135 | break 136 | else: 137 | logger.warning("任务匹配失败", name, url) 138 | queue.report_error("none_task", url) 139 | except Exception as e: 140 | logger.error("Worker线程异常", e, '\n', traceback.format_exc()) 141 | time.sleep(5) 142 | 143 | 144 | def import_tasks(path): 145 | """ 146 | 扫描并导入爬虫模块中的Tasks 147 | Return: 148 | [[regex, task]...] 149 | """ 150 | tasks = [] 151 | # 导入模块 152 | parent = os.path.dirname(path) 153 | if parent not in sys.path: 154 | sys.path.append(parent) 155 | basename = os.path.basename(path) 156 | try: 157 | logger.debug("加载爬虫模块", basename) 158 | _module = importlib.import_module(basename) 159 | except Exception as e: 160 | logger.error("加载爬虫模块异常", e, '\n', traceback.format_exc()) 161 | return [] 162 | # 扫描模块中合规的Task子类 163 | # 何为合规? 
164 | # 1.Task的子类; 2.filters成员; 3.导入无异常; 4.名称不以'__'开头 165 | for name in dir(_module): 166 | if name.startswith("__"): 167 | continue 168 | var = getattr(_module, name) 169 | try: 170 | is_subclass = issubclass(var, Task) 171 | except TypeError: 172 | continue 173 | try: 174 | if is_subclass: 175 | if hasattr(var, 'filters') and isinstance(var.filters, (list, tuple, str)): 176 | if isinstance(var.filters, str): 177 | filters = [var.filters] 178 | else: 179 | filters = var.filters 180 | for regex in filters: 181 | tasks.append([re.compile(regex), var]) 182 | logger.info("导入Task类", var.__name__) 183 | else: 184 | logger.warning("忽略Task类", var.__name__, "filters不合规") 185 | continue 186 | else: 187 | continue 188 | except Exception as e: 189 | logger.error("加载Task类异常", e, '\n', traceback.format_exc()) 190 | continue 191 | return tasks 192 | 193 | 194 | def start(spider_path, redis_conf, spider_configs, proxies, processes, threads): 195 | """ 196 | 重置爬虫状态后运行指定爬虫 197 | Args: 198 | spider_path: 爬虫目录 199 | redis_conf: Redis配置 200 | spider_configs: 爬虫配置 201 | proxies: 使用代理运行 202 | processes: 进程数量 203 | threads: 每个进程的线程数量 204 | """ 205 | logger.info("正在启动爬虫") 206 | db = redis.StrictRedis.from_url(redis_conf) 207 | name = os.path.basename(spider_path) # 取目录名为爬虫名 208 | RedisScripts.load(db) 209 | spider = Spider(db, name) 210 | # 注册爬虫/更新同名爬虫配置 211 | logger.info("注册爬虫", name) 212 | logger.info("爬虫配置", spider_configs) 213 | spider.upsert(spider_configs['seeders'], spider_configs['interval'], 214 | spider_configs['timeout'], spider_configs['precision'], 215 | spider_configs['args'], proxies, time.time()) 216 | # 重置爬虫状态 217 | status = spider.get_field("status") 218 | if status != 10: 219 | spider.set_field("status", 10) 220 | logger.info(f"重置爬虫状态", "{status} -> 10") 221 | # 回滚'timeout'异常队列 222 | queue = Queue(db, name) 223 | logger.debug("清理Redis") 224 | Queue.purge(db) 225 | logger.info("回滚超时任务") 226 | queue.rollback_tag("timeout", 0) 227 | # 启动Worker 228 | logger.info("正在启动Worker") 229 | spiders = multiprocessing.Manager().dict({name: [spider_path, 0]}) 230 | pool = [] 231 | token = multiprocessing.Manager().Value('d', 0) 232 | for _ in range(processes): 233 | p = multiprocessing.Process( 234 | target=worker_process, 235 | args=(redis_conf, spiders, threads, token.value, token) 236 | ) 237 | p.start() 238 | pool.append(p) 239 | logger.info("Worker启动成功") 240 | try: 241 | # 循环检查爬虫状态,当爬虫停止时终止运行 242 | while True: 243 | time.sleep(0.2) 244 | spider = Spider(db, name) 245 | status = spider.get_field("status") 246 | if status < 10: 247 | logger.info("爬虫停止,当前状态为:", Spider.status.get(status, "未知")) 248 | break 249 | except KeyboardInterrupt: 250 | logger.info("收到Ctrl+C", 'main') 251 | for p in pool: 252 | p.terminate() 253 | logger.info("爬虫停止", "Ctrl+C") 254 | except Exception as e: 255 | logger.error("爬虫停止", "未知异常", e, '\n', traceback.format_exc()) 256 | 257 | 258 | def start_all(redis_conf, spiders_path, processes, threads): 259 | """ 260 | 启动所有爬虫 261 | Args: 262 | redis_conf: Redis配置 263 | spiders_path: 放置所有爬虫的目录 264 | processes: 进程数量 265 | threads: 每个进程的线程数量 266 | """ 267 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='pyloom', 5 | version='0.0.7', 6 | packages=['pyloom'], 7 | url='https://pyloom.com', 8 | license='https://opensource.org/licenses/MIT', 9 | author='pyloom', 10 | author_email='ss@uutoto.com', 11 | 
description='古老的东方有一条虫,它的名字叫爬龙。', 12 | entry_points={ 13 | 'console_scripts': [ 14 | 'pyloom = pyloom.entry:main' 15 | ] 16 | }, 17 | install_requires=[ 18 | 'redis', 19 | 'cryptography >= 2.2.1', 20 | 'requests[security, socks] >= 2.10.0', 21 | 'bs4', 22 | 'lxml', 23 | 'furl', 24 | 'simplejson', 25 | 'checksumdir', 26 | 'docutils', # python-daemon的依赖 27 | 'python-daemon', 28 | 'tabulate', 29 | 'psutil' 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /spiders/DouBan250/README.md: -------------------------------------------------------------------------------- 1 | ## 豆瓣TOP250电影爬虫 2 | 3 | 用于演示编写最基本的爬虫、测试新功能。 -------------------------------------------------------------------------------- /spiders/DouBan250/__init__.py: -------------------------------------------------------------------------------- 1 | from pyloom.tasks import * 2 | 3 | 4 | class DouBanTask(Task): 5 | filters = ["^https://movie.douban.com/top250(\?start=\d+)?$"] 6 | 7 | def on_download(self): 8 | return self.client.get( 9 | url=self.url, 10 | headers={ 11 | "Host": "movie.douban.com", 12 | "User-Agent": self.ua.chrome 13 | } 14 | ) 15 | 16 | def on_parse(self): 17 | nodes = self.response.css.many("div.article ol > li") 18 | return [{ 19 | "title": node.one("span.title").text(), 20 | "rating": node.one("span.rating_num").text(), 21 | "quote": node.one("p.quote > span.inq").text() 22 | } for node in nodes] 23 | 24 | def on_link(self): 25 | if self.url.endswith("top250"): 26 | return [f"{self.url}?start={i}" for i in range(25, 250, 25)] 27 | 28 | def on_save(self): 29 | for movie in self.result: 30 | self.logger.info("抓到电影", movie) 31 | -------------------------------------------------------------------------------- /spiders/DouBan250/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://movie.douban.com/top250" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 5 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/README.md: -------------------------------------------------------------------------------- 1 | ## 豆瓣图书爬虫 2 | 3 | ### 图书信息(BookDetailsTask) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | -------------------- | ------------------------------------------------------------ | -------- | 7 | | result.title | 社会研究方法 | 书名 | 8 | | result.cover | https://img3.doubanio.com/view/subject/l/public/s2932505.jpg | 封面 | 9 | | result.info | {'作者': '[美]劳伦斯·纽曼', '出版社', '中国人民大学出版社'} | 基本信息 | 10 | | result.rating_num | 9.0 | 评分 | 11 | | result.rating_people | 202 | 评分人数 | 12 | | result.intro | 迄今所见中文社会研究方法书中最好的…… | 简介 | 13 | | result.tags | ['社会学', '研究方法'] | 标签 | 14 | 15 | ```python 16 | self.result = { 17 | 'title': '社会研究方法', 18 | 'cover': 'https://img3.doubanio.com/view/subject/l/public/s2932505.jpg', 19 | 'info': { 20 | '作者': '[美]劳伦斯·纽曼', 21 | '出版社': '中国人民大学出版社', 22 | '副标题': '定性和定量的取向', 23 | '原作名': 'Basics of Social Research: Qualitative and Quantitative Approaches', 24 | '译者': '郝大海', 25 | '出版年': '2007', 26 | '页数': '809', 27 | '定价': '89.80元', 28 | '丛书': '社会学译丛·经典教材系列', 29 | 'ISBN': '9787300075648' 30 | }, 31 | 'rating_num': '9.0', 32 | 'rating_people': '202', 33 | 
'intro': '迄今所见中文社会研究方法书中最好的一本,极力推荐研究生教学中采用。理清了许多问题,对定性和定量的对比非常精彩。', 34 | 'tags': ['社会学', '研究方法', '方法论', '社会研究方法', '定性', '教材', '纽曼', '定量'] 35 | } 36 | ``` 37 | 38 | + 基本信息(result.info),每本书的字段不固定。 39 | 40 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://book.douban.com/tag/?view=cloud" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = -10 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/DouBanBooks/tasks.py: -------------------------------------------------------------------------------- 1 | from pyloom.errors import * 2 | from pyloom.tasks import Task, CSS, retry 3 | 4 | 5 | class BaseTask(Task): 6 | @retry(10, 0) 7 | def on_download(self): 8 | """下载页面""" 9 | try: 10 | response = self.client.get( 11 | url=self.url, 12 | allow_redirects=False, 13 | headers={ 14 | "Host": "book.douban.com", 15 | "User-Agent": self.ua.chrome 16 | }, 17 | timeout=8 18 | ) 19 | except (ProxyError, RequestError): 20 | self.client.reload_proxy() 21 | raise RetryError 22 | # 检查是否被封禁 23 | if response.status_code == 200: 24 | s = 'window.location.href="https://sec.douban.com/a' 25 | if s in response.text: 26 | self.logger.warning("IP被封禁:200", self.client.address) 27 | self.client.reuse_proxy(300) 28 | else: 29 | self.client.reuse_proxy() 30 | return response 31 | elif response.status_code == 302: 32 | self.logger.warning("IP被封禁:302", self.client.address) 33 | self.client.reuse_proxy(300) 34 | else: 35 | self.logger.warning("请求错误", response.status_code) 36 | self.client.reload_proxy() 37 | raise RetryError 38 | 39 | def parse_tag_urls(self): 40 | """提取页面中所有的标签链接""" 41 | # 获取所有标签详情页的相对路径 42 | paths = self.response.re.many("/tag/\w+") 43 | # 构造每个标签前50页的标签详情页链接,优先级为2:最低 44 | return [ 45 | f"https://book.douban.com{path}?start={i*20}&type=R" 46 | for path in paths for i in range(50) 47 | ] 48 | 49 | 50 | class BookDetailsTask(BaseTask): 51 | """图书详情页""" 52 | filters = ["https://book.douban.com/subject/(\d+)/"] 53 | 54 | def on_parse(self): 55 | css = self.response.css 56 | # 书籍基本信息 57 | info = {} 58 | for line in css.one("div#info").html().split("
"): 59 | items = [ 60 | ' '.join(s.split()) 61 | for s in CSS(line).text(separator=" ").split(":", 1) 62 | if s.strip() 63 | ] 64 | if len(items) == 2: 65 | info[items[0]] = items[1] 66 | result = { 67 | "title": css.one("h1 > span").text(), 68 | "cover": css.one("div#mainpic img").attrs.get("src", None), 69 | "info": info, 70 | "rating_num": css.one("div.rating_self > strong.rating_num").text() or None, 71 | "rating_people": css.one("a.rating_people > span").default(None).text(), 72 | "intro": css.one("div#link-report div.intro > p").default(None).text(separator="\n"), 73 | "tags": [n.text() for n in css.many("div#db-tags-section a")], 74 | } 75 | return result 76 | 77 | def on_link(self): 78 | books = self.response.re.many("https://book.douban.com/subject/\d+/") 79 | # 指定优先级 80 | return { 81 | 0: books, 82 | 4: self.parse_tag_urls() 83 | } 84 | 85 | def on_save(self): 86 | self.logger.info("抓到新书", self.result) 87 | 88 | 89 | class TagsTask(BaseTask): 90 | """热门标签页""" 91 | filters = ["https://book.douban.com/tag/\?view=cloud"] 92 | 93 | def on_link(self): 94 | return self.parse_tag_urls() 95 | 96 | 97 | class TagDetailsTask(BaseTask): 98 | """标签详情页""" 99 | filters = ["https://book.douban.com/tag/(\w+)\?start=(\d+)&type=R"] 100 | 101 | def on_link(self): 102 | books = self.response.re.many("https://book.douban.com/subject/\d+/") 103 | return { 104 | 0: books, 105 | 4: self.parse_tag_urls() 106 | } 107 | -------------------------------------------------------------------------------- /spiders/LaGou/README.md: -------------------------------------------------------------------------------- 1 | ## 拉钩爬虫 2 | 3 | ### 职位详情(JobDetails) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | ------------------ | -------------------------------------------- | --------------------- | 7 | | result._id | 5080106 | 职位id | 8 | | result.title | 演员实习生 | 名称 | 9 | | result.label | ['移动互联网', '广告营销'] | 标签 | 10 | | result.job_request | 2k-4k/上海 /经验应届毕业生 /大专及以上 /实习 | 要求 | 11 | | result.advantage | 周末双休,地铁周边,做五休二,氛围融洽 | 职位诱惑 | 12 | | result.job_bt | 职位描述:岗位职责:1参与公司广告... 
| 职位描述 | 13 | | result.work_addr | 上海-徐汇区- 桂林路396号3号楼 | 工作地址 | 14 | | result.status | 0 | 状态,0:进行中 1:结束 | 15 | | result.job_company | 乐推(上海)文化传播有限公司 | 公司名字 | 16 | | result.type | 移动互联网,广告营销领域 | 类型 | 17 | | result.time | 2018-09-02 | 发布时间 | 18 | 19 | ```python 20 | self.result = { 21 | '_id': '5080106', 22 | 'title': '演员实习生', 23 | 'label': ['移动互联网', '广告营销'], 24 | 'job_request': '2k-4k/上海 /经验应届毕业生 /大专及以上 /实习', 25 | 'advantage': '周末双休,地铁周边,做五休二,氛围融洽', 26 | 'job_bt': '职位描述:岗位职责:1参与公司广告和短剧的拍摄;2负责公司项目前期筹备等的相关工作;3出演抖音广告与搞笑视频。任职要求:1长相甜美,外形清新亮丽,镜头感强,有强烈的表现力;2专科以上学历,表演系专业优先;3性格活泼开朗、思维活跃、为人正直;4工作态度积极;5仅仅招收女演员。', 27 | 'work_addr': '上海-徐汇区- 桂林路396号3号楼', 28 | 'status': 0, 29 | 'job_company': '乐推(上海)文化传播有限公司', 30 | 'type': '移动互联网,广告营销领域', 31 | 'time': '2018-09-02' 32 | } 33 | ``` 34 | 35 | 36 | 37 | ### 公司详情(GongSiDetails) 38 | 39 | | 字段 | 示例 | 说明 | 40 | | ------------------------ | -------------------------- | -------- | 41 | | result._id | 324 | 公司id | 42 | | result.company_abbr | 爱立示 | 简称 | 43 | | result.company_full_name | 慈溪爱立示信息科技有限公司 | 全称 | 44 | | result.type | 信息安全,数据服务 | 类型 | 45 | | result.process | 未融资 | 融资状态 | 46 | | result.number | 15-50人 | 人数 | 47 | | result.address | 北京 | 公司地点 | 48 | | result.label | ['技能培训', '岗位晋升'] | 公司标签 | 49 | | result.website | http://www.alstru.com | 公司网站 | 50 | 51 | ```python 52 | self.result = { 53 | '_id': '324', 54 | 'company_abbr': '爱立示', 55 | 'company_full_name': '慈溪爱立示信息科技有限公司', 56 | 'type': '信息安全,数据服务', 57 | 'process': '未融资', 58 | 'number': '15-50人', 59 | 'address': '北京', 60 | 'label': ['技能培训', '岗位晋升', '扁平管理', '领导好', '五险一金', '弹性工作'], 61 | 'website': 'http://www.alstru.com' 62 | } 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /spiders/LaGou/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | from .tasks import * 3 | 4 | 5 | def reactor(): 6 | return time.time() 7 | -------------------------------------------------------------------------------- /spiders/LaGou/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 3600 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/LaGou/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import uuid 4 | import random 5 | import string 6 | import datetime 7 | from pyloom import tasks, errors 8 | 9 | 10 | class LaGouJobTask(tasks.Task): 11 | @staticmethod 12 | def get_random(const): 13 | return "".join(random.sample(string.ascii_letters + string.digits, const)) 14 | 15 | @staticmethod 16 | def get_uuid(): 17 | return time.strftime("%Y%m%d%H%M%S-", time.localtime()) + str(uuid.uuid1()) 18 | 19 | def get_cookies(self): 20 | cookies = { 21 | 'LGUID': self.get_uuid(), 22 | 'user_trace_token': '20180705084851-8a154ee4-0f2b-406d-9130-e835805b49ee', 23 | 'X_HTTP_TOKEN': 'c2e6c0237f5362aca8d13748cfdd8274', 24 | 'JSESSIONID': self.get_random(47).upper(), 25 | 'SEARCH_ID': self.get_random(32).lower(), 26 | 'LGSID': 
self.get_uuid(), 27 | 'PRE_UTM': '', 28 | 'PRE_HOST': '', 29 | 'PRE_SITE': '', 30 | 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com', 31 | 'LGRID': self.get_uuid() 32 | } 33 | return cookies 34 | 35 | @tasks.retry(5, 0) 36 | def on_download(self): 37 | """下载页面""" 38 | if self.buckets.local.get('cookies') is None: 39 | self.buckets.local.set('cookies', self.get_cookies()) 40 | cookies = self.buckets.local.get('cookies') 41 | try: 42 | response = self.client.get( 43 | url=self.url, 44 | allow_redirects=False, 45 | headers={ 46 | "User-Agent": self.ua.chrome, 47 | "Accept-Encoding": "gzip", 48 | "Host": "www.lagou.com", 49 | "Referer": "https://www.lagou.com/jobs/list_" 50 | }, 51 | cookies=cookies 52 | ) 53 | except errors.ProxyError: 54 | self.logger.info("代理错误") 55 | raise errors.RetryError 56 | except errors.RequestError: 57 | self.logger.info("请求错误") 58 | raise errors.RetryError 59 | 60 | if response.status_code == 301: 61 | # 页面被删除 62 | raise errors.TaskFinish 63 | elif response.status_code == 302: 64 | self.logger.info(f"网页被封") 65 | self.buckets.local.set('cookies', self.get_cookies()) 66 | self.queue.freeze(5) 67 | raise errors.RetryError 68 | elif "页面加载中" in response.text or "错误网关" in response.text: 69 | raise errors.RetryError 70 | else: 71 | return response 72 | 73 | 74 | class JobDetails(LaGouJobTask): 75 | """职位详情页面""" 76 | filters = "https://www.lagou.com/jobs/(\d+).html" 77 | 78 | def on_parse(self): 79 | """提取数据""" 80 | try: 81 | publish_time = self.response.css.one(".publish_time").text()[0:-8] 82 | except errors.TaskError as e: 83 | return 84 | 85 | if re.match("(\d+):(\d+)", publish_time) is not None: 86 | publish_time = time.strftime("%Y-%m-%d", time.localtime()) 87 | elif re.match("(\d+)天前", publish_time): 88 | publish_time = (datetime.date.today() - 89 | datetime.timedelta(days=int(publish_time[0]))).strftime('%Y-%m-%d') 90 | status = 0 if self.response.css.one(".send-CV-btn").text() == "投个简历" else 1 91 | result = { 92 | "_id": re.search("(\d+)", self.url).group(0), 93 | "title": self.response.css.one(".job-name > .name").text(), 94 | "label": [label.text() for label in self.response.css.many(".labels")], 95 | "job_request": "".join(request.text() for request in self.response.css.many(".job_request > p > span")), 96 | "advantage": self.response.css.one(".job-advantage > p").text(), 97 | "job_bt": self.response.css.one(".job_bt").text(), 98 | "work_addr": self.response.css.one(".work_addr").text()[0:-8], 99 | "status": status, 100 | "job_company": self.response.css.one("#job_company > dt > a > img").attrs["alt"], 101 | "type": self.response.css.one(".c_feature > li").text(), 102 | "time": publish_time 103 | } 104 | return result 105 | 106 | def on_link(self): 107 | """提取链接""" 108 | job_urls = [] 109 | max_id = self.buckets.share.get("max_id") or 4913130 # 爬虫的最大URL 110 | use_id = self.buckets.share.get("use_id") or -1 # 当前使用的最大URL 111 | waiting_url_const = self.queue.detail["waiting"][1] # 当前职位等待队列中的URL数量 112 | 113 | if use_id >= max_id: 114 | self.queue.interval = 3600 115 | 116 | if waiting_url_const <= 2000: 117 | # 当等待队列中的URL数量少于1000时,添加URL到等待队列中 118 | start_id = use_id + 1 119 | end_id = use_id + 2000 if (use_id + 2000) < max_id else max_id 120 | for path in range(start_id, end_id): 121 | job_urls.append(f"https://www.lagou.com/jobs/{path}.html") 122 | self.buckets.share.set("use_id", use_id + 2000) 123 | 124 | gongsi_urls = [self.response.css.one("#job_company > dt > a").attrs.get("href", None)] 125 | return { 126 | 0: gongsi_urls, 127 | 1: job_urls 128 | } 129 | 130 | def 
on_save(self): 131 | """保存数据""" 132 | self.logger.info(f"抓到职位信息 {self.result}") 133 | 134 | 135 | class GongSiDetails(LaGouJobTask): 136 | """公司页面详情信息""" 137 | filters = "https://www.lagou.com/gongsi/(\d+).html" 138 | 139 | def on_parse(self): 140 | result = { 141 | "_id": re.search("(\d+)", self.url).group(0), 142 | "company_abbr": self.response.css.one(".hovertips").text(), 143 | "company_full_name": self.response.css.one(".hovertips").attrs["title"], 144 | "type": self.response.css.one(".type + span").text(), 145 | "process": self.response.css.one(".process + span").text(), 146 | "number": self.response.css.one(".number + span").text(), 147 | "address": self.response.css.one(".address + span").default(None).text(), 148 | "label": [label.text() for label in self.response.css.many(".con_ul_li")], 149 | "website": self.response.css.one(".hovertips").attrs.get("href", None) 150 | } 151 | return result 152 | 153 | def on_save(self): 154 | self.logger.info(f"抓到公司信息 {self.result}") 155 | 156 | 157 | class JobsList(LaGouJobTask): 158 | """工作列表,用于增量拉取职位信息""" 159 | filters = "https://www.lagou.com/jobs/positionAjax.json?(\w+)" 160 | 161 | @tasks.retry(-1, 0) 162 | def on_download(self): 163 | try: 164 | response = self.client.get( 165 | url=self.url, 166 | allow_redirects=False, 167 | headers={ 168 | "User-Agent": self.ua.chrome, 169 | "DNT": "1", 170 | "Host": "www.lagou.com", 171 | "Origin": "https://www.lagou.com", 172 | "Referer": "https://www.lagou.com/jobs/list_", 173 | "X-Anit-Forge-Code": "0", 174 | "X-Anit-Forge-Token": None, 175 | "X-Requested-With": "XMLHttpRequest" 176 | } 177 | ) 178 | except (errors.ProxyError, errors.RequestError): 179 | raise errors.RetryError 180 | 181 | if response.json['success'] is False: 182 | self.logger.error(f"列表页发现最新URL出现错误,速率过快") 183 | raise errors.TaskBreak 184 | else: 185 | return response 186 | 187 | def on_parse(self): 188 | """提取信息""" 189 | old_max_id = self.buckets.share.get("max_id") or 0 190 | new_max_id = self.response.json['content']['positionResult']['result'][0]['positionId'] 191 | if old_max_id < new_max_id: 192 | self.buckets.share.set("max_id", new_max_id) 193 | self.queue.interval = 0.01 194 | return [f"https://www.lagou.com/jobs/{new_max_id}.html"] 195 | else: 196 | return [] 197 | 198 | def on_link(self): 199 | """提取链接""" 200 | return { 201 | 1: self.result, 202 | 2: [f"https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false&T={time.time()}"] 203 | } 204 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/README.md: -------------------------------------------------------------------------------- 1 | ## 拼多多爬虫 2 | 3 | #### 分类商品列表(OperationTask) 4 | 5 | 搜索栏分类商品列表 6 | 7 | | 字段 | 示例 | 说明 | 8 | | ------------------- | ------------------------------------------------------------ | ------------ | 9 | | result.goods_id | 2721076214 | 商品id | 10 | | result.goods_name | 【两件装】秋季男装长袖t恤... 
| 商品名称 | 11 | | result.thumb_url | http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg | 商品图片链接 | 12 | | result.cnt | 545 | 已售数量 | 13 | | result.normal_price | 2500 | 商品售价 | 14 | | result.market_price | 9900 | 商品标价 | 15 | | result.price | 1280 | 商品拼团价 | 16 | | result.updated_at | 2018-09-02 13:58:08.176553 | 爬取时间 | 17 | 18 | ```python 19 | self.result = [ 20 | { 21 | 'goods_id': 2721076214, 22 | 'goods_name': '【两件装】秋季男装长袖t恤青年韩版潮流上衣学生修身百搭打底衫', 23 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-09-01/75ed6981d7961404ba75f0b1f9dd6322.jpeg', 24 | 'cnt': 545, 25 | 'normal_price': 2500, 26 | 'market_price': 9900, 27 | 'price': 1280, 28 | 'updated_at': '2018-09-02 13:58:08.176553' 29 | }, 30 | { 31 | 'goods_id': 142150779, 32 | 'goods_name': '【花花公子贵宾】春夏秋款宽松直筒牛仔裤男弹力休闲商务大码长裤', 33 | 'thumb_url': 'http://t00img.yangkeduo.com/goods/images/2018-08-14/a18025b1e91445e5ac2acb26be773cd1.jpeg', 34 | 'cnt': 294167, 35 | 'normal_price': 4990, 36 | 'market_price': 29800, 37 | 'price': 2990, 38 | 'updated_at': '2018-09-02 13:58:08.176553' 39 | } 40 | ... 41 | ] 42 | ``` 43 | 44 | 45 | 46 | #### 商品详情(GoodsTask) 47 | 48 | | 字段 | 示例 | 说明 | 49 | | ------------------- | ------------------------------------ | ---------------------------- | 50 | | result.goods_sn | "1805231480604761" | sn码 | 51 | | result.goods_id | 1480604761 | 商品id | 52 | | result.cat_id | 9813 | 搜索id | 53 | | result.goods_name | 【凡爱宝贝】3d立体墙贴自粘... | 名称 | 54 | | result.goods_desc | 【3d立体墙贴】【环保无味】... | 简介 | 55 | | result.market_price | 3500 | 标价 | 56 | | result.is_onsale | 1 | 是否在售,0:下架 1:出售 | 57 | | result.thumb_url | http://t00img.yangkeduo.com/... | 商品图标 | 58 | | result.hd_thumb_url | http://t00img.yangkeduo.com/... | 商品放大图标 | 59 | | result.image_url | http://t00img.yangkeduo.com/... 
| 商品图片链接 | 60 | | result.price | {"min_on_sale_group_price": 358,...} | 商品价格,见[详情](#price) | 61 | | result.gallery | [{"id": 34954263707,'url':...}] | 商品详情介绍 | 62 | | result.created_at | 1527069514 | 创建时间戳 | 63 | | result.sales | 167701 | 销售量 | 64 | | result.cat_id_list | [9316, 9402, 9813] | 商品多级分类 | 65 | | result.sku | [{"sku_id": 33940681934,...}] | 商品规格详情,见[详情](#sku) | 66 | 67 | ```python 68 | self.result = { 69 | "goods_sn": "1805231480604761", 70 | "goods_id": 1480604761, 71 | "cat_id": 9813, 72 | "goods_name": "【凡爱宝贝】3d立体墙贴自粘防水墙纸防撞壁纸客厅卧室砖纹贴纸", 73 | "goods_desc": "【3d立体墙贴】【环保无味】【无甲醛 免胶自粘】绿色环保、无毒、无味,免人工,带胶撕开底纸即可粘贴,产品粘性强,不易脱落,具有很好的防撞、防水、防潮效果,易遮盖污点,环保无异味,施工简单,规格:70cm宽X77cm高; 工厂直销,砖纹形,装饰儿童房、卧室、客厅背景墙、走廊,也可发挥想象自由裁剪DIY。【计算方式】长x宽=面积,总面积÷单片面积=片数,一片尺寸是70cm宽X77cm高=0.539平方【友情提示】为避免不够,建议需要尽量多买2片备着,因为不同批次颜色有可能存在差异,所以请亲们一次购买足够。", 74 | "market_price": 3500, 75 | "is_onsale": 1, 76 | "thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/f9ba2f52be83d2f55142c55f44ec678c.jpeg", 77 | "hd_thumb_url": "http://t00img.yangkeduo.com/goods/images/2018-08-20/8c5790dfea2422328ee3a487f3685ed6.jpeg", 78 | "image_url": "http://t00img.yangkeduo.com/goods/images/2018-07-22/95065d45399fce770bb49de0fba5c590.jpeg", 79 | "goods_type": 1, 80 | "gallery": [ 81 | { 82 | "id": 34954263707, 83 | "url": "http://t00img.yangkeduo.com/t10img/images/2018-07-16/d864ed35818e90521cf858951d9dc349.jpeg" 84 | } 85 | ], 86 | "created_at": 1527069514, 87 | "sales": 167701, 88 | "price": { 89 | "min_on_sale_group_price": 358, 90 | "max_on_sale_group_price": 781, 91 | "min_on_sale_normal_price": 490, 92 | "max_on_sale_normal_price": 1500, 93 | "min_group_price": 358, 94 | "max_group_price": 781, 95 | "max_normal_price": 1500, 96 | "min_normal_price": 490, 97 | "old_min_on_sale_group_price": 390, 98 | "old_max_on_sale_group_price": 860, 99 | "old_min_group_price": 390, 100 | "old_max_group_price": 860 101 | }, 102 | "cat_id_list": [9316, 9402, 9813], 103 | "sku": [ 104 | { 105 | "sku_id": 33940681934, 106 | "goods_id": 1480604761, 107 | "thumb_url": "http://t00img.yangkeduo.com/t07img/images/2018-07-12/94b7c9302b62c64e22914e6e36fb9d40.png", 108 | "quantity": 0, 109 | "normal_price": 1500, 110 | "group_price": 561, 111 | "old_group_price": 610, 112 | "specs": [ 113 | { 114 | "spec_key": "尺寸", 115 | "spec_value": "尺寸70*77厘米/1张" 116 | }, 117 | { 118 | "spec_key": "颜色", 119 | "spec_value": "特价白色(70*77厘米)" 120 | } 121 | ] 122 | } 123 | ] 124 | } 125 | ``` 126 | 127 | + cat_id_list为商品的多级分类栏id,依次为商品一级分类、商品二级分类、商品三级分类 128 | 129 | ### 附录 130 | 131 | #### price 132 | 133 | | 值 | 含义 | 134 | | --------------------------- | ---------------------- | 135 | | min_on_sale_group_price | 在售商品团购最低价 | 136 | | max_on_sale_group_price | 在售商品团购最高价 | 137 | | min_on_sale_normal_price | 在售商品最低价 | 138 | | max_on_sale_normal_price | 在售商品最高价 | 139 | | min_group_price | 商品团购最低价 | 140 | | max_group_price | 商品团购最高价 | 141 | | max_normal_price | 商品最高价 | 142 | | min_normal_price | 商品最低价 | 143 | | old_min_on_sale_group_price | 在售商品团购旧的最低价 | 144 | | old_max_on_sale_group_price | 在售商品团购旧的最高价 | 145 | | old_min_group_price | 商品团购旧的最低价 | 146 | | old_max_group_price | 商品团购旧的最高价 | 147 | 148 | #### sku 149 | 150 | | 值 | 含义 | 151 | | ---------------- | ------------ | 152 | | sku_id | 规格id | 153 | | goods_id | 商品id | 154 | | thumb_url | 规格图片链接 | 155 | | quantity | 数据 | 156 | | normal_price | 标价 | 157 | | group_price | 团购价 | 158 | | old_group_price | 旧的团购价 | 159 | | specs.spec_key | 规格参数 | 160 | | specs.spec_value | 规格参数值 | 161 | 162 | 
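
The tasks in this spider (shown below) only log the parsed results. As a minimal, hypothetical sketch of how the GoodsTask result documented above could be persisted, the snippet flattens a few fields into one JSON line per item; the helper name, output path, and the assumption that prices are integer cents (分) are illustrative and not part of the project.

```python
import json


def save_goods_record(result, path="pdd_goods.jsonl"):
    """Append one GoodsTask result (shape documented above) as a JSON line.

    Hypothetical helper: field names follow the README tables; treating
    prices as integer cents is an assumption based on the sample values.
    """
    record = {
        "goods_id": result["goods_id"],
        "goods_name": result["goods_name"],
        "market_price_yuan": result["market_price"] / 100,
        "min_group_price_yuan": result["price"]["min_group_price"] / 100,
        "sales": result["sales"],
        "cat_id_list": result["cat_id_list"],
        "sku_count": len(result["sku"]),
        "created_at": result["created_at"],
    }
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```
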
-------------------------------------------------------------------------------- /spiders/PinDuoDuo/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations?pdduid=" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/PinDuoDuo/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import string 4 | import datetime 5 | from pyloom import tasks 6 | from pyloom.errors import * 7 | 8 | 9 | def get_list_id(opt_id): 10 | """返回list_id:(opt_id)_(10位随机字符串)""" 11 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10)) 12 | 13 | 14 | class PinDuoDuoTask(tasks.Task): 15 | """搜索栏""" 16 | _redis = None 17 | goods_url = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/" 18 | operation_url = "http://apiv4.yangkeduo.com/v4/operation/" 19 | 20 | @tasks.retry(tries=5, backoff=0) 21 | def on_download(self): 22 | """下载链接""" 23 | try: 24 | resp = self.client.get( 25 | url=self.url, 26 | headers={ 27 | "User-Agent": self.ua.android, 28 | "Referer": "Android", 29 | "Host": "apiv4.yangkeduo.com" 30 | } 31 | ) 32 | except ProxyError: 33 | self.client.reload_proxy() 34 | raise RetryError 35 | except RequestError: 36 | raise RetryError 37 | 38 | try: 39 | if "error_code" in resp.json: 40 | error_code = resp.json.get('error_code', None) 41 | else: 42 | error_code = None 43 | except JSONDecodeError: 44 | error_code = None 45 | 46 | if error_code == 40001 or resp.status_code == 503 or resp.status_code == 504: 47 | self.client.reuse_proxy() 48 | raise RetryError 49 | 50 | if resp.status_code == 403 or resp.status_code == 429: 51 | self.client.reload_proxy() 52 | raise RetryError 53 | else: 54 | self.client.reuse_proxy() 55 | return resp 56 | 57 | 58 | class HomeOperationTask(PinDuoDuoTask): 59 | """搜索栏""" 60 | filters = "http://apiv4.yangkeduo.com/api/fiora/v2/home_operations\?pdduid=" 61 | 62 | def on_parse(self): 63 | targets = [] 64 | for childrens in self.response.json: 65 | targets.append(childrens["id"]) 66 | for children in childrens["children"]: 67 | targets.append(children["id"]) 68 | return targets 69 | 70 | def on_link(self): 71 | return { 72 | 4: [f"{self.operation_url}{opt_id}/groups?opt_type=2&size=50&offset=0&list_id={get_list_id(opt_id)}&pdduid=" 73 | for opt_id in self.result] 74 | } 75 | 76 | 77 | class OperationTask(PinDuoDuoTask): 78 | """分类商品结果""" 79 | filters = "http://apiv4.yangkeduo.com/v4/operation/(\w+)" 80 | 81 | def on_parse(self): 82 | goods = [] 83 | for good in self.response.json["goods_list"]: 84 | goods.append( 85 | { 86 | "goods_id": good["goods_id"], 87 | "goods_name": good["goods_name"], 88 | "thumb_url": good["thumb_url"], 89 | "cnt": good["cnt"], 90 | "normal_price": good["normal_price"], 91 | "market_price": good["market_price"], 92 | 
"price": good["group"]["price"], 93 | "updated_at": str(datetime.datetime.now()) 94 | } 95 | ) 96 | operation = { 97 | "goods_id": [good["goods_id"] for good in goods], 98 | "opt_infos": self.response.json["opt_infos"], 99 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1], 100 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1], 101 | "flip": self.response.json["flip"], 102 | "next_offset": str(self.response.json["flip"]).split(";")[-1] 103 | } 104 | return goods, operation 105 | 106 | def on_link(self): 107 | goods, operation = self.result 108 | 109 | goods_list = [f"{self.goods_url}{goods_id}?goods_id={goods_id}&from=0&pdduid=" 110 | for goods_id in operation["goods_id"]] 111 | operation_list = [f'{self.operation_url}{opt_infos["id"]}/groups?opt_type=2&size=50&offset=0' 112 | f'&list_id={get_list_id(opt_infos["id"])}&pdduid=' 113 | for opt_infos in operation["opt_infos"]] 114 | 115 | if operation["flip"] is not None: 116 | operation_list.append(f'{self.operation_url}{operation["opt_id"]}/groups?opt_type=2&size=50&offset=' 117 | f'{operation["next_offset"]}&list_id={operation["list_id"]}' 118 | f'&flip={operation["flip"]}&pdduid=') 119 | return { 120 | 2: goods_list, 121 | 4: operation_list 122 | } 123 | 124 | def on_save(self): 125 | self.logger.info(f'抓到商品列表 {self.result[0]}') 126 | 127 | 128 | class GoodsTask(PinDuoDuoTask): 129 | """商品详情接口""" 130 | filters = "http://apiv4.yangkeduo.com/api/oakstc/v14/goods/(\w+)" 131 | 132 | def on_parse(self): 133 | _sku = self.response.json["sku"] 134 | goods_info = { 135 | "goods_sn": self.response.json["goods_sn"], 136 | "goods_id": self.response.json["goods_id"], 137 | "cat_id": self.response.json["cat_id"], 138 | "goods_name": self.response.json["goods_name"], 139 | "goods_desc": self.response.json["goods_desc"], 140 | "market_price": self.response.json["market_price"], 141 | "is_onsale": self.response.json["is_onsale"], 142 | "thumb_url": self.response.json["thumb_url"], 143 | "hd_thumb_url": self.response.json["hd_thumb_url"], 144 | "image_url": self.response.json["image_url"], 145 | "goods_type": self.response.json["goods_type"], 146 | "gallery": [{"id": gallery["id"], "url":gallery["url"]} for gallery in self.response.json["gallery"]], 147 | "created_at": self.response.json["created_at"], 148 | "sales": self.response.json["sales"], 149 | "price": { 150 | "min_on_sale_group_price": self.response.json["min_on_sale_group_price"], 151 | "max_on_sale_group_price": self.response.json["max_on_sale_group_price"], 152 | "min_on_sale_normal_price": self.response.json["min_on_sale_normal_price"], 153 | "max_on_sale_normal_price": self.response.json["max_on_sale_normal_price"], 154 | "min_group_price": self.response.json["min_group_price"], 155 | "max_group_price": self.response.json["max_group_price"], 156 | "max_normal_price": self.response.json["max_normal_price"], 157 | "min_normal_price": self.response.json["min_normal_price"], 158 | "old_min_on_sale_group_price": self.response.json["old_min_on_sale_group_price"], 159 | "old_max_on_sale_group_price": self.response.json["old_max_on_sale_group_price"], 160 | "old_min_group_price": self.response.json["old_min_group_price"], 161 | "old_max_group_price": self.response.json["old_max_group_price"] 162 | }, 163 | "cat_id_list": [self.response.json["cat_id_1"], 164 | self.response.json["cat_id_2"], 165 | self.response.json["cat_id_3"]] 166 | } 167 | sku = [] 168 | for sku_list in _sku: 169 | sku.append({ 170 | "sku_id": sku_list["sku_id"], 171 | 
"goods_id": sku_list["goods_id"], 172 | "thumb_url": sku_list["thumb_url"], 173 | "quantity": sku_list["quantity"], 174 | "normal_price": sku_list["normal_price"], 175 | "group_price": sku_list["group_price"], 176 | "old_group_price": sku_list["old_group_price"], 177 | "specs": sku_list["specs"] 178 | }) 179 | goods_info["sku"] = sku 180 | return goods_info 181 | 182 | def on_save(self): 183 | self.logger.info(f'抓到商品信息 {self.result}') 184 | -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/README.md: -------------------------------------------------------------------------------- 1 | ## 拼多多爬虫网页版 2 | 3 | #### 分类商品列表(ListTask) 4 | 搜索栏分类商品列表 5 | 6 | | 字段 | 示例 | 说明 | 7 | | ------------------- | ------------------------------------------------------------ | ------------ | 8 | | result.thumb_url | https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg | 商品图片链接 | 9 | | result.country | | 国度 | 10 | | result.goods_name | vivo原装耳机x21 x20... | 商品名称 | 11 | | result.short_name | vivo原装耳机x21 x20... | 商品简称 | 12 | | result.sales_tip | 已拼1490件 | 商品销售提示 | 13 | | result.goods_id | 6636323997 | 商品id | 14 | | result.cnt | 545 | 已售数量 | 15 | | result.normal_price | 2500 | 商品售价 | 16 | | result.market_price | 9900 | 商品标价 | 17 | | result.price | 1280 | 商品拼团价 | 18 | | result.link_url | goods.html?goods_id=6636323997&gallery_id=103375816423 | 商品详情连接 | 19 | | result.mall_name | | 商品店铺名称 | 20 | | result.tag | ['极速退款'] | 商品标签 | 21 | | result.updated_at | 2018-09-02 13:58:08.176553 | 爬取时间 | 22 | 23 | ```python 24 | self.result = [ 25 | { 26 | 'thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg', 27 | 'country': '', 28 | 'goods_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话', 29 | 'short_name': 'vivo原装耳机x21 x20 x9 x7 x6 y67 y66 y37 y27线控带麦可通话', 30 | 'sales_tip': '已拼1490件', 31 | 'cnt': 1490, 32 | 'goods_id': 6636323997, 33 | 'hd_thumb_url': 'https://t00img.yangkeduo.com/goods/images/2019-03-17/075bf350-0b66-4dfe-99e4-98eb912ac158.jpg', 34 | 'hd_url': '', 35 | 'normal_price': 1480, 36 | 'market_price': 9900, 37 | 'price': 1280, 38 | 'link_url': 'goods.html?goods_id=6636323997&gallery_id=103375816423', 39 | 'mall_name': None, 40 | 'tag': ['极速退款'], 41 | 'updated_at': '2019-04-15 23:05:57.603136' 42 | }, 43 | ... 44 | ] 45 | ``` 46 | 47 | ### 商品详情(GoodsTask 48 | 49 | ```python 50 | self.result = { 51 | 'goods': { 52 | 'serverTime': 1555340763, 53 | 'serverTimeTen': 15553407630, 54 | 'allowedRegions': '2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,22,23,24,25,26,27,30,31,32', 55 | 'catID': 5794, 56 | 'country': '', 57 | 'warehouse': '', 58 | 'goodsDesc': '如果你还再为佩戴迷你型双耳容...', 59 | 'goodsID': 2058994703, 60 | 'goodsName': '防水超小无线蓝牙耳机双耳5.0跑步运动一对迷你vivo入耳oppo耳机', 61 | 'shareDesc': '如果你还再为佩戴迷你型双耳容...', 62 | 'goodsType': 1, 63 | 'localGroups': [], 64 | 'hasLocalGroup': 1, 65 | 'bannerHeight': 375, 66 | 'topGallery': [ 67 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/strip%7CimageView2/2/w/1300/q/80', 68 | ... 69 | ], 70 | 'viewImageData': [ 71 | '//t00img.yangkeduo.com/goods/images/2019-02-21/3852f3ef-500c-4590-adf9-356a3397b0ce.jpg?imageMogr2/quality/70', 72 | ... 73 | ], 74 | 'detailGallery': [ 75 | {'url': '//t00img.yangkeduo.com/t09img/images/2018-07-03/84f1bc3741b3182df6f9d4dae633c9ec.jpeg?imageMogr2/quality/70', 'width': 790, 'height': 790}, 76 | ... 
77 | ], 78 | 'videoGallery': [], 79 | 'hasLiveGallery': False, 80 | 'descVideoGallery': [], 81 | 'mallID': 17984, 82 | 'groupTypes': [ 83 | {'requireNum': '1', 'price': '0', 'totalPrice': '0', 'groupID': 2960548556, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999}, 84 | {'requireNum': '2', 'price': '0', 'totalPrice': '0', 'groupID': 2960548557, 'startTime': 1451577600, 'endTime': 2082729600, 'orderLimit': 999999} 85 | ], 86 | 'skus': [ 87 | { 88 | 'skuID': 38010790017, 89 | 'quantity': 158, 90 | 'initQuantity': 0, 91 | 'isOnSale': 1, 92 | 'soldQuantity': 0, 93 | 'specs': [ 94 | {'spec_key': '颜色', 'spec_value': '黑色-支持双耳通话', 'spec_key_id': 1215, 'spec_value_id': 843019793} 95 | ], 96 | 'thumbUrl': '//t00img.yangkeduo.com/t09img/images/2018-07-03/047554b0d2cd49183b5b2ed2380c528a.jpeg', 97 | 'limitQuantity': 999999, 98 | 'normalPrice': '218', 99 | 'groupPrice': '162', 100 | 'oldGroupPrice': '198', 101 | 'skuExpansionPrice': '0', 102 | 'unselectGroupPrice': '0' 103 | }, 104 | ... 105 | ], 106 | 'thumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/a5d77844f14e6438fe196b0d08fd9c63.jpeg', 107 | 'hdThumbUrl': '//t00img.yangkeduo.com/goods/images/2019-02-21/7869f190bfd614a9d3151316f02642a1.jpeg', 108 | 'eventType': 0, 109 | 'isOnSale': True, 110 | 'isGoodsOnSale': True, 111 | 'isSkuOnSale': True, 112 | 'freeCoupon': [], 113 | 'isApp': 0, 114 | 'isFreshmanApp': 0, 115 | 'sideSalesTip': '已拼47件', 116 | 'bottomSalesTip': '', 117 | 'hasAddress': False, 118 | 'catID1': 5752, 119 | 'catID2': 5793, 120 | 'catID3': 5794, 121 | 'catID4': 0, 122 | 'eventComing': False, 123 | 'isMutiGroup': False, 124 | 'isNewUserGroup': False, 125 | 'isSpike': False, 126 | 'isTodaySpike': False, 127 | 'isTomorrowSpike': False, 128 | 'activity': { 129 | 'activityID': 11464199, 130 | 'activityType': 101, 131 | 'startTime': 1554825600, 132 | 'endTime': 1555775999 133 | }, 134 | 'isGroupFree': False, 135 | 'isSpikeComing': False, 136 | 'overseaType': 0, 137 | 'isHaitao': False, 138 | 'isAppNewerJoinGroup': False, 139 | 'countryLogo': '', 140 | 'gpv': None, 141 | 'quickRefund': False, 142 | 'rv': True, 143 | 'maxNormalPrice': '218', 144 | 'minNormalPrice': '218', 145 | 'maxGroupPrice': '162', 146 | 'minGroupPrice': '162', 147 | 'maxOnSaleGroupPrice': '162', 148 | 'minOnSaleGroupPrice': '162', 149 | 'maxOnSaleGroupPriceInCent': 16200, 150 | 'minOnSaleGroupPriceInCent': 16200, 151 | 'maxOnSaleNormalPrice': '218', 152 | 'minOnSaleNormalPrice': '218', 153 | 'minTotalGroupPrice': '324', 154 | 'oldMinOnSaleGroupPriceInCent': 19800, 155 | 'unselectMinGroupPrice': '0', 156 | 'unselectMaxGroupPrice': '0', 157 | 'skipGoodsIDs': ['0'], 158 | 'tag': -1, 159 | 'icon': {'id': 5, 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/21bdb71af69e346fc73098a23e808656.png', 'width': 116, 'height': 45}, 160 | 'tagIcon': [], 161 | 'isSecondHand': 0, 162 | 'promotionBanner': { 163 | 'id': 1, 164 | 'url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 165 | 'default_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 166 | 'new_url': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 167 | 'url_v2': 'http://t13img.yangkeduo.com/cart/2019-04-03/77c14365ebf58c55a06f0e78fc017859.jpeg', 168 | 'url_v2_h': 96, 169 | 'url_v2_w': 750, 170 | 'serverTime': 1555340763 171 | }, 172 | 'isMallDsr': 1, 173 | 'hasPromotion': 1, 174 | 'appClientOnly': 0, 175 | 'isColdGoods': 1, 176 | 'singleCardStatus': 0, 177 | 'singleCardCount': 0, 
178 | 'goodsProperty': [ 179 | {'key': '佩戴方式', 'values': ['入耳式']}, 180 | ... 181 | ], 182 | ... 183 | } 184 | ``` 185 | 186 | get_anticontent.js来自https://github.com/SergioJune/Spider-Crack-JS/blob/master/pinduoduo/get_anticontent.js -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "https://mobile.yangkeduo.com/classification.html" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/PinDuoDuoWEB/tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import execjs 4 | import string 5 | import datetime 6 | import re 7 | import os 8 | from pyloom import tasks 9 | from pyloom.errors import * 10 | 11 | 12 | def get_list_id(opt_id): 13 | """返回list_id:(opt_id)_(10位随机字符串)""" 14 | return str(opt_id) + "_" + "".join(random.sample(string.ascii_letters + string.digits, 10)) 15 | 16 | 17 | def get_anti_content(ua): 18 | path = os.path.abspath(os.path.dirname(__file__)) 19 | with open(os.path.join(path, 'get_anticontent.js'), 'r', encoding='utf-8') as f: 20 | js = f.read() 21 | ctx = execjs.compile(js) 22 | url = "https://mobile.yangkeduo.com/catgoods.html" 23 | return ctx.call('get_anti', url, ua) 24 | 25 | 26 | class SearchTask(tasks.Task): 27 | filters = "https://mobile.yangkeduo.com/classification.html" 28 | 29 | @tasks.retry() 30 | def on_download(self): 31 | try: 32 | ua = self.ua.chrome # 随机获取ua 33 | resp = self.client.get( 34 | url=self.url, 35 | headers={ 36 | "User-Agent": ua, 37 | 'authority': 'mobile.yangkeduo.com', 38 | 'pragma': 'no-cache', 39 | 'cache-control': 'no-cache', 40 | 'upgrade-insecure-requests': '1', 41 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 42 | 'application/signed-exchange;v=b3', 43 | 'accept-encoding': 'gzip, deflate, br', 44 | 'accept-language': 'zh-CN,zh;q=0.9' 45 | } 46 | ) 47 | except ProxyError: 48 | self.client.reload_proxy() 49 | raise RetryError 50 | except RequestError: 51 | raise RetryError 52 | return resp 53 | 54 | def on_parse(self): 55 | data = json.loads(self.response.re.many("__NEXT_DATA__.*?__NEXT_LOADED_PAGES")[0][16:-20]) 56 | result = [] 57 | for i in data['props']['pageProps']['data']['operationsData']['detailData']: 58 | for j in i['cat']: 59 | result.append(f'https://mobile.yangkeduo.com/proxy/api/v4/operation/{j["optID"]}/groups' 60 | f'?offset=0&size=100&opt_type=2&sort_type=DEFAULT&list_id={get_list_id(j["optID"])}' 61 | f'&pdduid=0') 62 | return result 63 | 64 | def on_link(self): 65 | """解析url,并添加到队列""" 66 | return { 67 | 4: self.result 68 | } 69 | 70 | 71 | class ListTask(tasks.Task): 72 | filters = "https://mobile.yangkeduo.com/proxy/api/v4/operation/(\w+)" 73 | 74 | @tasks.retry() 75 | def on_download(self): 76 | try: 77 | ua = 
self.ua.chrome # 随机获取ua 78 | url = self.url + f"&anti_content={get_anti_content(ua)}" 79 | resp = self.client.get( 80 | url=url, 81 | headers={ 82 | "User-Agent": ua, 83 | 'authority': 'mobile.yangkeduo.com', 84 | 'pragma': 'no-cache', 85 | 'cache-control': 'no-cache', 86 | 'upgrade-insecure-requests': '1', 87 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 88 | 'application/signed-exchange;v=b3', 89 | 'accept-encoding': 'gzip, deflate, br', 90 | 'accept-language': 'zh-CN,zh;q=0.9' 91 | } 92 | ) 93 | except ProxyError: 94 | self.client.reload_proxy() 95 | raise RetryError 96 | except RequestError: 97 | raise RetryError 98 | 99 | return resp 100 | 101 | def on_parse(self): 102 | goods = [] 103 | for good in self.response.json["goods_list"]: 104 | goods.append( 105 | { 106 | "thumb_url": good["thumb_url"], 107 | "country": good["country"], 108 | "goods_name": good["goods_name"], 109 | "short_name": good["short_name"], 110 | "sales_tip": good["sales_tip"], 111 | "cnt": good["cnt"], 112 | "goods_id": good["goods_id"], 113 | "hd_thumb_url": good["hd_thumb_url"], 114 | "hd_url": good["hd_url"], 115 | "normal_price": good["normal_price"], 116 | "market_price": good["market_price"], 117 | "price": good["group"]["price"], 118 | "link_url": good["link_url"], 119 | "mall_name": good.get('mall_name'), 120 | "tag": [i["text"] for i in good["tag_list"]], 121 | "updated_at": str(datetime.datetime.now()) 122 | } 123 | ) 124 | operation = { 125 | "link_url": [good["link_url"] for good in goods], 126 | "opt_infos": self.response.json["opt_infos"], 127 | "opt_id": re.search(r'operation/(\d+)/groups', self.url).group(0).split("/")[1], 128 | "list_id": re.search(r'&list_id=(\d+)_(\w+)', self.url).group(0).split("=")[1], 129 | "flip": self.response.json["flip"], 130 | "next_offset": str(self.response.json["flip"]).split(";")[0] 131 | } 132 | return goods, operation 133 | 134 | def on_link(self): 135 | url = "https://mobile.yangkeduo.com/" 136 | 137 | goods, operation = self.result 138 | goods_list = [f'{url}{link_url}' for link_url in operation["link_url"]] 139 | operation_list = [f'{url}/proxy/api/v4/operation/{opt["id"]}/groups?offset=0&size=100&opt_type=2' 140 | f'&sort_type=DEFAULT&list_id={get_list_id(opt["id"])}&pdduid=0' 141 | for opt in operation["opt_infos"]] 142 | if operation["flip"] is not None: 143 | operation_list.append(f'{url}/proxy/api/v4/operation/{operation["opt_id"]}/groups?opt_type=2&size=100' 144 | f'&offset={operation["next_offset"]}&list_id={operation["list_id"]}' 145 | f'&flip={operation["flip"]}&pdduid=0') 146 | self.logger.debug(goods_list) 147 | self.logger.debug(operation_list) 148 | return { 149 | 1: goods_list, 150 | 4: operation_list 151 | } 152 | 153 | def on_save(self): 154 | self.logger.debug(self.result[0]) 155 | 156 | 157 | class GoodsTask(tasks.Task): 158 | filters = "https://mobile.yangkeduo.com/goods.html" 159 | 160 | @tasks.retry() 161 | def on_download(self): 162 | try: 163 | resp = self.client.get( 164 | url=self.url, 165 | headers={ 166 | "User-Agent": self.ua.chrome, 167 | 'authority': 'mobile.yangkeduo.com', 168 | 'pragma': 'no-cache', 169 | 'cache-control': 'no-cache', 170 | 'upgrade-insecure-requests': '1', 171 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,' 172 | 'application/signed-exchange;v=b3', 173 | 'accept-encoding': 'gzip, deflate, br', 174 | 'accept-language': 'zh-CN,zh;q=0.9' 175 | } 176 | ) 177 | except ProxyError: 178 | self.client.reload_proxy() 
179 | raise RetryError 180 | except RequestError: 181 | raise RetryError 182 | self.logger.debug(resp.status_code) 183 | 184 | if re.search('"initDataObj":{"needLogin":true}', resp.text) is not None: 185 | raise RetryError 186 | return resp 187 | 188 | def on_parse(self): 189 | data = json.loads(self.response.re.many("window.rawData=.*?}};")[0][15:-1]) 190 | return { 191 | "goods": data["store"]["initDataObj"]["goods"], 192 | "mall": data["store"]["initDataObj"]["mall"], 193 | "reviews": data["store"]["initDataObj"]["reviews"], 194 | } 195 | -------------------------------------------------------------------------------- /spiders/WeiBo/README.md: -------------------------------------------------------------------------------- 1 | ## 微博爬虫 2 | 3 | ### 用户信息(UserTask) 4 | 5 | | 字段 | 示例 | 说明 | 6 | | ------------------------ | ---------------------------------- | ---------------------------------- | 7 | | result.uid | 1680938527 | 用户唯一标识 | 8 | | result.screen_name | 作恶太妖精 | 用户昵称 | 9 | | result.statuses_count | 12275 | 微博数量 | 10 | | result.verified_type | -1 | 账号类型,见[附录](#verified_type) | 11 | | result.verified_type_ext | -1 | 附加账号类型,-1:无 1:橙V 0:金V | 12 | | result.description | 因为追求梦想而伟大!梦想是熬出来的 | 简介 | 13 | | result.gender | f | 性别,f:女 m:男 | 14 | | result.mbtype | 0 | 未知 | 15 | | result.urank | 35 | 账号等级 | 16 | | result.mbrank | 0 | 会员等级 | 17 | | result.followers_count | 754 | 粉丝数量 | 18 | | result.follow_count | 602 | 关注数量 | 19 | | result.profile_image_id | 6431161fjw1e8qgp5bmzyj2050050aa8 | 头像图片号 | 20 | | result.status | 0 | 账号状态,-1:不可用 0:可用 | 21 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 | 22 | 23 | ```python 24 | self.result = { 25 | 'uid': 2554193671, 26 | 'screen_name': '黑镜头世界', 27 | 'statuses_count': 88, 28 | 'verified_type': -1, 29 | 'verified_type_ext': -1, 30 | 'description': '一张残旧的老照片,能给你带来灌顶的震撼~', 31 | 'gender': 'm', 32 | 'mbtype': 0, 33 | 'urank': 2, 34 | 'mbrank': 0, 35 | 'followers_count': 84, 36 | 'follow_count': 4, 37 | 'profile_image_id': '983de707jw1e8qgp5bmzyj2050050aa8', 38 | 'status': 0, 39 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 10, 231390) 40 | } 41 | ``` 42 | 43 | 44 | 45 | ### 原创微博(UserTask) 46 | 47 | 每个用户前10条微博中的原创微博 48 | 49 | | 字段 | 示例 | 说明 | 50 | | ---------------------- | ------------------------------------ | ------------ | 51 | | result.mid | 4264355334054790 | 微博唯一标识 | 52 | | result.uid | 1225419417 | 用户唯一标识 | 53 | | result.text | 哇!抽到了!爱国宝 | 微博正文 | 54 | | result.reposts_count | 14 | 转发数量 | 55 | | result.comments_count | 114 | 评论数量 | 56 | | result.attitudes_count | 1481 | 点赞数量 | 57 | | result.source | iPhone X | 来源 | 58 | | result.updated_at | 2018-08-10 00:02:09 | 抓取时间 | 59 | | result.created_at | 2018-07-21 22:56:41 | 发表时间 | 60 | | result.images | ["490a6a99gy1fthvjguf0gj20v91voqbr"] | 图片列表 | 61 | | result.is_long_text | False | 是否为长微博 | 62 | 63 | ```python 64 | self.result = [ 65 | { 66 | 'mid': 4278823505781372, 67 | 'uid': 2094949595, 68 | 'text': '杭州的绿水青山留下了许多诗句,和风熏,杨柳轻,郁郁青山江水平,笑语满香径。什么使你爱上了这座城市?{网页链接}(https://weibo.com/tv/v/Gw6iL1Q0e?fid=1034:4276507087207862) \u200b', 69 | 'reposts_count': 1, 70 | 'comments_count': 1, 71 | 'attitudes_count': 2, 72 | 'source': '微博 weibo.com', 73 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904636), 74 | 'created_at': datetime.datetime(2018, 8, 30, 21, 8, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 75 | 'images': [], 76 | 'is_long_text': False 77 | }, 78 | { 79 | 'mid': 4278785248875113, 80 | 'uid': 2094949595, 81 | 'text': '你当时学的专业是什么?你现在又在做什么工作呢? 
\u200b', 82 | 'reposts_count': 0, 83 | 'comments_count': 12, 84 | 'attitudes_count': 1, 85 | 'source': '微博 weibo.com', 86 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 11, 904846), 87 | 'created_at': datetime.datetime(2018, 8, 30, 18, 36, 3, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 88 | 'images': ['7cde64dbgy1furl2c240jj20e80cujs2'], 89 | 'is_long_text': False 90 | }, 91 | ] 92 | 93 | ``` 94 | 95 | 96 | 97 | ### 转发微博(UserTask) 98 | 99 | 每个用户前10条微博中的转发微博 100 | 101 | | 字段 | 示例 | 说明 | 102 | | ---------------------- | ---------------------- | ------------------------------- | 103 | | result.mid | 4269756171586532 | 微博唯一标识 | 104 | | result.uid | 1680938527 | 用户唯一标识 | 105 | | result.text | //@李宇春如初:转发微博 | 微博正文 | 106 | | result.reposts_count | 0 | 转发数量 | 107 | | result.comments_count | 0 | 评论数量 | 108 | | result.attitudes_count | 0 | 点赞数量 | 109 | | result.source | iPhone客户端 | 来源 | 110 | | result.pmid | 4269752379437757 | 父级微博的mid(上层转发,可空) | 111 | | result.smid | 4269748974496983 | 源微博的mid(原创微博) | 112 | | result.suid | 5427461387 | 源微博的uid | 113 | | result.updated_at | 2018-08-10 00:02:02 | 抓取时间 | 114 | | result.created_at | 2018-08-05 20:37:42 | 发表时间 | 115 | 116 | ```python 117 | self.result = [ 118 | {'mid': 4278871820165470, 119 | 'uid': 1802393212, 120 | 'text': '这壁纸超萌哦,喜欢就快来打call @Line壁纸酱', 121 | 'reposts_count': 0, 122 | 'comments_count': 0, 123 | 'attitudes_count': 2, 124 | 'source': '皮皮时光机', 125 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250057), 126 | 'created_at': datetime.datetime(2018, 8, 31, 0, 20, 2, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 127 | 'pmid': 0, 128 | 'smid': 4278723350035431, 129 | 'suid': 6150916523 130 | }, 131 | { 132 | 'mid': 4278866795185761, 133 | 'uid': 1802393212, 134 | 'text': '[心]', 135 | 'reposts_count': 0, 136 | 'comments_count': 0, 137 | 'attitudes_count': 2, 138 | 'source': '皮皮时光机', 139 | 'updated_at': datetime.datetime(2018, 8, 31, 0, 38, 12, 250450), 140 | 'created_at': datetime.datetime(2018, 8, 31, 0, 0, 4, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 141 | 'pmid': 0, 142 | 'smid': 4266013078506248, 143 | 'suid': 5604000425} 144 | ] 145 | ``` 146 | 147 | 148 | 149 | ### 关注列表(FollowerTask) 150 | 151 | 每个用户最后180个关注、部分大V关注 152 | 153 | | 字段 | 示例 | 说明 | 154 | | ------ | ------------------------ | ----------------------- | 155 | | result | [5427461387, 1680938527] | 关注列表中所有用户的uid | 156 | 157 | ```python 158 | self.result = [ 159 | 1199430302, 5291824241, 1744583555, 1225627080, 1192504311, 1539469391, 1831216671, 1855790127, 160 | ] 161 | ``` 162 | 163 | + 通过self.uid获取当前用户UID 164 | 165 | 166 | 167 | ### 粉丝列表(FanTask) 168 | 169 | 每个用户最后4500个粉丝、部分大V粉丝 170 | 171 | | 字段 | 示例 | 说明 | 172 | | ------ | -------------------------- | ----------------------- | 173 | | result | [5427461387", "1680938527] | 粉丝列表中所有用户的uid | 174 | 175 | ```python 176 | self.result = [ 177 | 2011541160, 6561198332, 5650361179, 5203386014, 6586203686, 3975892466, 5280555723, 6200526771, 178 | ] 179 | ``` 180 | 181 | + 通过self.uid获取当前用户UID 182 | 183 | 184 | 185 | ### 附录 186 | 187 | #### verified_type 188 | 189 | |值|含义| 190 | |:---|:---| 191 | |-1|无认证| 192 | |0|个人认证| 193 | |1|政府| 194 | |2|企业| 195 | |3|媒体| 196 | |4|校园| 197 | |5|网站| 198 | |6|应用| 199 | |7|机构| 200 | |8|待审企业| 201 | |200|初级达人| 202 | |220|中高级达人| 203 | |400|已故V用户| 204 | 205 | -------------------------------------------------------------------------------- /spiders/WeiBo/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.tasks import * 2 | -------------------------------------------------------------------------------- /spiders/WeiBo/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "user:1111681197", 4 | "user:1863847262" 5 | ] 6 | # 调度间隔时间(秒) 7 | # 控制当前爬虫的抓取频率 8 | interval = 0.2 9 | # 任务超时时间(秒) 10 | # 超时后,将被移入tag='timeout'的异常队列中 11 | timeout = 120 12 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 13 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 14 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 15 | # 特别注意,此字段一经设置不可更改 16 | precision = 0.00001 17 | # 自定义参数 18 | # Task中使用self.args访问这里的args 19 | args = {} 20 | -------------------------------------------------------------------------------- /spiders/WeiBo/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import datetime 3 | import itertools 4 | from pyloom import tasks 5 | from pyloom.errors import * 6 | 7 | 8 | class PWATask(tasks.Task): 9 | _redis = None 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(PWATask, self).__init__(*args, **kwargs) 13 | self.uid = self.url.split(":")[1] 14 | self.client.headers = { 15 | 'Accept': 'application/json, text/plain, */*', 16 | 'Referer': f'https://m.weibo.cn/profile/{self.uid}', 17 | 'MWeibo-Pwa': '1', 18 | 'X-Requested-With': 'XMLHttpRequest', 19 | 'User-Agent': self.ua.chrome 20 | } 21 | 22 | @tasks.retry(tries=16, delay=0, backoff=0) 23 | def download(self, url, params): 24 | """下载并判断是否被封禁""" 25 | try: 26 | resp = self.client.get(url, params, timeout=8) 27 | except (ProxyError, RequestError) as e: 28 | if self.download.count >= 3: # 重试两次后更换代理 29 | self.logger.debug("请求错误", e) 30 | self.client.reload_proxy() 31 | raise RetryError 32 | try: 33 | errno = resp.json.get('errno', None) 34 | except JSONDecodeError: 35 | errno = None 36 | if resp.status_code == 418 or (resp.status_code == 403 and errno == "100005"): 37 | self.logger.debug("响应包错误,IP已被封禁") 38 | self.client.reuse_proxy(150) 39 | self.client.reload_proxy() 40 | raise RetryError 41 | elif errno == "20003": 42 | self.logger.debug("响应包错误,用户不存在", self.uid) 43 | raise TaskFinish() 44 | elif resp.status_code != 200: 45 | self.logger.debug(f"响应包错误,状态码:{resp.status_code}", self.uid) 46 | if self.download.count >= 3: 47 | self.client.reload_proxy() 48 | raise RetryError 49 | elif errno is not None: 50 | msg = resp.json['msg'] 51 | self.logger.debug(f"响应包错误,errno={errno},msg={msg}", self.uid) 52 | raise TaskError("msg:" + msg) 53 | else: 54 | self.client.reuse_proxy(0) 55 | return resp 56 | 57 | 58 | class UserTask(PWATask): 59 | """用户资料""" 60 | filters = "user:\w+" 61 | 62 | def on_download(self): 63 | return self.download('https://m.weibo.cn/profile/info', {'uid': self.uid}) 64 | 65 | def parse_text(self, _text): 66 | """转换微博内容以节约空间""" 67 | 68 | # 将表情包转为:[拜拜] 69 | def replacer_first(match): 70 | return match.groups()[0] 71 | 72 | text = re.sub( 73 | r']+class="url-icon">\s*([^', replacer_first, _text 74 | ) 75 | 76 | # 将链接转为:{title}(url) 77 | # title一般为话题 78 | def replacer_link(match): 79 | groups = match.groups() 80 | return f"{{{groups[1]}}}({groups[0]})" 81 | 82 | text = re.sub( 83 | r']+href="([^"]+)".*?>\s*([^<>]+)\s*', 84 | replacer_link, text 85 | ) 86 | # 将@连接转为: @XXX 87 | text = re.sub(r']+>(@[^<>]+)', replacer_first, text) 88 | # 将
<br />转为\n 89 | text = text.replace("<br />
", "\n") 90 | return text 91 | 92 | def parse_status(self, _status): 93 | """递归提取源微博与被转发微博""" 94 | status = { 95 | 'mid': int(_status['id']), 96 | 'uid': _status['user']['id'], 97 | 'text': self.parse_text(_status['text']), 98 | 'reposts_count': _status['reposts_count'], 99 | 'comments_count': _status['comments_count'], 100 | 'attitudes_count': _status['attitudes_count'], 101 | 'source': _status['source'], 102 | 'updated_at': datetime.datetime.now(), 103 | 'created_at': datetime.datetime.strptime( 104 | _status['created_at'], "%a %b %d %H:%M:%S %z %Y"), 105 | } 106 | retweeted_status = _status.get('retweeted_status', None) 107 | if retweeted_status: # 转发 108 | status['pmid'] = _status.get('pid', 0) 109 | status['smid'] = int(retweeted_status['id']) 110 | status['suid'] = int(retweeted_status['user']['id']) 111 | repost = status 112 | status, _ = self.parse_status(retweeted_status) 113 | return status, repost 114 | else: # 原创 115 | status['images'] = _status['pic_ids'] 116 | status['is_long_text'] = _status['isLongText'] 117 | return status, None 118 | 119 | def on_parse(self): 120 | # 用户信息 121 | _user = self.response.json['data']['user'] 122 | user = { 123 | 'uid': _user['id'], 124 | 'screen_name': _user['screen_name'], 125 | 'statuses_count': _user['statuses_count'], 126 | 'verified_type': _user['verified_type'], 127 | 'verified_type_ext': _user.get('verified_type_ext', -1), 128 | 'description': _user['description'], 129 | 'gender': _user['gender'], 130 | 'mbtype': _user['mbtype'], 131 | 'urank': _user['urank'], 132 | 'mbrank': _user['mbrank'], 133 | 'followers_count': _user['followers_count'], 134 | 'follow_count': _user['follow_count'], 135 | 'profile_image_id': _user['profile_image_url'].rsplit("/", 1)[1].split(".")[0], 136 | 'status': 0, 137 | 'updated_at': datetime.datetime.now() 138 | } 139 | # 最近微博 140 | statuses = [] 141 | reposts = [] 142 | for _status in self.response.json['data']['statuses']: 143 | status, repost = self.parse_status(_status) 144 | if status: 145 | statuses.append(status) 146 | if repost: 147 | reposts.append(repost) 148 | 149 | return user, statuses, reposts 150 | 151 | def on_link(self): 152 | return { 153 | 3: [f'follow:{self.uid}'], 154 | 4: [f'fan:{self.uid}'] 155 | } 156 | 157 | def on_save(self): 158 | self.logger.info("抓到用户信息", self.result[0]) 159 | if self.result[1]: 160 | self.logger.info("抓到原创微博", self.result[1]) 161 | if self.result[2]: 162 | self.logger.info("抓到转发微博", self.result[2]) 163 | 164 | 165 | class ContainerTask(PWATask): 166 | """解析关注和粉丝列表的响应包""" 167 | 168 | def on_parse(self): 169 | targets = [] 170 | for page in self.response: 171 | cards = page.json['data']['cards'] 172 | for card in cards: 173 | style = card.get('card_style', None) 174 | group = card['card_group'] 175 | if style is None: # 普通用户 176 | targets.extend(g['user']['id'] for g in group) 177 | elif style == 1: # 推荐用户 178 | if len(group) == 3 and 'scheme' in group[2]: # 相关大V用户 179 | if 'users' in group[1]: 180 | ids = [user['id'] for user in group[1]['users']] 181 | elif 'user' in group[1]: 182 | ids = [group[1]['user']['id']] 183 | else: 184 | ids = [] 185 | else: # 大V用户 186 | ids = [g['user']['id'] for g in group if 'user' in g] 187 | targets.extend(ids) 188 | else: 189 | raise TaskError(f"card_style={style}") 190 | _targets = [] 191 | for t in targets: 192 | try: 193 | _targets.append(int(t)) 194 | except ValueError: 195 | pass 196 | return _targets 197 | 198 | def on_link(self): 199 | return {1: [f"user:{uid}" for uid in self.result]} if self.result else {} 200 | 201 | 202 
| class FollowerTask(ContainerTask): 203 | """关注列表""" 204 | filters = "follow:\w+" 205 | 206 | def on_download(self): 207 | pages = [] 208 | url = "https://m.weibo.cn/api/container/getIndex" 209 | for page_id in itertools.count(1): 210 | params = {"containerid": f"231051_-_followers_-_{self.uid}"} 211 | if page_id != 1: 212 | params['page'] = page_id 213 | resp = self.download(url, params) 214 | if resp.json['ok'] == 0: # 已到最后一页 215 | break 216 | pages.append(resp) 217 | return pages 218 | 219 | def on_save(self): 220 | self.logger.info("抓到关注列表", self.result) 221 | 222 | 223 | class FanTask(ContainerTask): 224 | """粉丝列表""" 225 | filters = "fan:\w+" 226 | 227 | def on_download(self): 228 | pages = [] 229 | url = "https://m.weibo.cn/api/container/getIndex" 230 | for since_id in itertools.count(1): 231 | params = {"containerid": f"231051_-_fans_-_{self.uid}"} 232 | if since_id != 1: 233 | params['since_id'] = since_id 234 | resp = self.download(url, params) 235 | if resp.json['ok'] == 0: # 已到最后一页 236 | break 237 | pages.append(resp) 238 | return pages 239 | 240 | def on_save(self): 241 | self.logger.info("抓到粉丝列表", self.result) 242 | -------------------------------------------------------------------------------- /spiders/Ziroom/README.md: -------------------------------------------------------------------------------- 1 | ## 自如爬虫 2 | 3 | ### 安装 4 | 5 | 1. [安装tesseract-ocr](https://github.com/tesseract-ocr/tesseract/wiki) 6 | 7 | 2. 安装pytesseract库 8 | 9 | ``` 10 | pip install pytesseract 11 | ``` 12 | 13 | 14 | ### 房源列表(NLTask) 15 | 16 | 房源列表中的房源信息 17 | 18 | | 字段 | 示例 | 说明 | 19 | | ---------------- | ----------------------------------------------------- | ------------ | 20 | | result.price | 1160 | 价格 | 21 | | result.href | www.ziroom.com/z/vr/61441027.html | 详情信息链接 | 22 | | result.img_src | static8.ziroom.com/phoenix/pc/images/list/loading.jpg | 图片 | 23 | | result.block | 天恒乐活城 | 小区名 | 24 | | result.name | 整租 · 天恒乐活城2居室-南 | 房源名 | 25 | | result.site | [通州通州其它] 亦庄线次渠南 | 位置 | 26 | | result.detail | 14.1 ㎡\|6/6层\|3室1厅距15号线石门站690米 | 细节 | 27 | | result.room_tags | ['离地铁近', '独立阳台', '集体供暖', '友家3.0 木棉'] | 标签 | 28 | 29 | ```python 30 | self.result = [ 31 | { 32 | 'price': '1830', 33 | 'href': 'www.ziroom.com/z/vr/61514855.html', 34 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg', 35 | 'block': '世茂维拉', 'name': '友家 · 世茂维拉5居室-南卧', 36 | 'site': '[房山长阳] 房山线广阳城', 37 | 'detail': '12.3 ㎡|5/5层|5室1厅距房山线广阳城站696米有2间空房', 38 | 'room_tags': ['离地铁近', '独卫', '集体供暖', '友家4.0 拿铁'] 39 | }, 40 | { 41 | 'price': '1830', 42 | 'href': 'www.ziroom.com/z/vr/261810.html', 43 | 'img_src': 'static8.ziroom.com/phoenix/pc/images/list/loading.jpg', 44 | 'block': '前进花园石门苑', 45 | 'name': '友家 · 前进花园石门苑3居室-南卧', 46 | 'site': '[顺义顺义城] 15号线石门', 47 | 'detail': '14.1 ㎡|6/6层|3室1厅距15号线石门站690米', 48 | 'room_tags': ['离地铁近', '独立阳台', '集体供暖', '友家4.0 布丁'] 49 | }, 50 | ... 51 | ] 52 | ``` 53 | 54 | 55 | 56 | ### 房源详情(VRTask) 57 | 58 | | 字段 | 示例 | 说明 | 59 | | ------------------------ | -------------------------------------------------------- | ------------------------------------------ | 60 | | result.img | ['http://pic.ziroom.com/house_images.jpg',...] 
| 介绍图片 | 61 | | result.room_name | 和平家园小区4居室-02卧 | 名称 | 62 | | result.ellipsis | [昌平 沙河] 昌平线 昌平 | 位置 | 63 | | result.room_id | 61264525 | 房间id | 64 | | result.house_id | 60203175 | 房屋id | 65 | | result.current_city_code | 110000 | 所在城市编码,见[附录](#current_city_code) | 66 | | result.detail_room | {'面积': '15.4㎡', '朝向': '南', '户型': '4室1厅合',...} | 房屋参数 | 67 | | result.number | BJZRGY0818215849_02 | 编号 | 68 | | result.periphery | 学校:中国政法大学法学院、中国石油大学... | 周边 | 69 | | result.traffic | 公交:314路、昌平9路... | 交通 | 70 | | result.configuration | ['bed', 'desk', 'chest', 'calorifier'] | 配置 | 71 | | result.roommate | [{性别': 'man', '房间号': '01卧', '星座': '天蝎座',...}] | 室友信息 | 72 | | result.price | 1890 | 价格 | 73 | 74 | ```python 75 | self.result = { 76 | 'img':[ 77 | 'http://pic.ziroom.com/house_images/g2m1/M00/5B/DF/v180x135.jpg', 78 | 'http://pic.ziroom.com/house_images/g2m1/M00/5D/0B/v180x135.jpg' 79 | ], 80 | 'room_name': '和平家园小区4居室-02卧', 81 | 'ellipsis': '[昌平 沙河] 昌平线 昌平', 82 | 'room_id': '61264525', 83 | 'house_id': '60203175', 84 | 'current_city_code': '110000', 85 | 'detail_room': { 86 | '面积': '15.4㎡', 87 | '朝向': '南', 88 | '户型': '4室1厅合', 89 | '楼层': '6/6层', 90 | '交通': '距15号线石门307米距15号线顺义1621米距15号线南法信2290米' 91 | }, 92 | 'number': 'BJZRGY0818215849_02', 93 | 'periphery': '学校:中国政法大学法学院 医院:北京化工大学校医院', 94 | 'traffic': '公交:314路、昌平9路、914路、昌平3路、昌平5路、326路、345路', 95 | 'configuration': ['bed', 'desk', 'chest', 'calorifier], 96 | 'roommate':[ 97 | { 98 | '性别': 'man', 99 | '房间号': '01卧', 100 | '星座': '天蝎座', 101 | '职业': '产品', 102 | '入住时间': '2018/07-2019/07' 103 | }, 104 | { 105 | '性别': 'current', 106 | '房间号': '02卧', 107 | '星座': '…', 108 | '职业': '…', 109 | '入住时间': '…' 110 | } 111 | ], 112 | 'price': 1890 113 | } 114 | ``` 115 | 116 | + 房间室友(result.roommate)不存在时,性别为current,'…'代表信息为空 117 | 118 | ### 附录 119 | 120 | #### current_city_code 121 | 122 | | 城市编码 | 城市名称 | 123 | | -------- | -------- | 124 | | 110000 | 北京 | 125 | | 310000 | 上海 | 126 | | 440300 | 深圳 | 127 | | 330100 | 杭州 | 128 | | 320100 | 南京 | 129 | | 440100 | 广州 | 130 | | 510100 | 成都 | 131 | | 420100 | 武汉 | 132 | | 120000 | 天津 | 133 | 134 | -------------------------------------------------------------------------------- /spiders/Ziroom/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import * 2 | -------------------------------------------------------------------------------- /spiders/Ziroom/configs.py: -------------------------------------------------------------------------------- 1 | # 爬虫初始化时填入队列的种子页面 2 | seeders = [ 3 | "http://www.ziroom.com/z/nl/z3.html" 4 | ] 5 | # 调度间隔时间(秒) 6 | # 控制当前爬虫的抓取频率 7 | interval = 0 8 | # 任务超时时间(秒) 9 | # 超时后,将被移入tag='timeout'的异常队列中 10 | timeout = 120 11 | # 设置BloomFilter精度,用于过滤'已完成'的URL,避免重复抓取 12 | # 若精度设置过低,会造成过多的页面被误报为'已完成' 13 | # 应权衡爬虫对误报的忍耐度与服务器内存消耗,酌情更改 14 | # 特别注意,此字段一经设置不可更改 15 | precision = 0.0001 16 | # 自定义参数 17 | # Task中使用self.args访问这里的args 18 | args = {} 19 | -------------------------------------------------------------------------------- /spiders/Ziroom/tasks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import requests 4 | import pytesseract 5 | import urllib.parse 6 | from PIL import Image 7 | from io import BytesIO 8 | from pyloom import tasks 9 | from pyloom.errors import * 10 | 11 | 12 | class ZiRoomTask(tasks.Task): 13 | @tasks.retry(5, 0) 14 | def on_download(self): 15 | # 解决列表页第一页冲突问题 16 | page = re.search('\\?p=1', self.url) 17 | if page is not None: 18 | self.client.reuse_proxy() 19 
| raise TaskFinish 20 | try: 21 | response = self.client.get( 22 | url=self.url, 23 | allow_redirects=False, 24 | headers={ 25 | "User-Agent": self.ua.chrome 26 | } 27 | ) 28 | except (ProxyError, RequestError) as e: 29 | if self.on_download.count >= 5: # 重试两次后更换代理 30 | self.logger.debug("请求错误", e) 31 | self.client.reload_proxy() 32 | raise RetryError 33 | 34 | if "请核对您输入的页面地址是否正确" in response.text or "The requested URL could not be retrieved" in response.text: 35 | if self.on_download.count >= 5: # 重试两次后更换代理 36 | self.logger.info("请求次数", self.on_download.count) 37 | self.client.reload_proxy() 38 | else: 39 | self.client.reuse_proxy(5) 40 | raise RetryError 41 | if response.status_code == 302: 42 | if self.on_download.count >= 5: # 重试两次后更换代理 43 | self.logger.info("请求次数", self.on_download.count) 44 | self.client.reload_proxy() 45 | else: 46 | self.client.reuse_proxy(5) 47 | raise RetryError 48 | if response.status_code == 500: 49 | raise RetryError 50 | self.client.reuse_proxy() 51 | return response 52 | 53 | @staticmethod 54 | def get_price(response): 55 | """通过图像匹配返回房租价格""" 56 | image_url = re.search('static8.ziroom.com/phoenix/pc/images/price/(\w+).png', response.text)[0] 57 | image = Image.open(BytesIO(requests.get(f'http://{image_url}').content)) 58 | digital_table = pytesseract.image_to_string(image, config='--psm 7') 59 | offset_list = re.search('\\[((,)?\\[(\w)(,\w)+\\])+\\]', response.text)[0] 60 | price_list = [] 61 | for offset in offset_list[2:-2].split('],['): 62 | a = "" 63 | for offset_num in offset.split(','): 64 | a = a + (digital_table[int(offset_num)]) 65 | price_list.append(a) 66 | return price_list 67 | 68 | 69 | class NLTask(ZiRoomTask): 70 | filters = "http://(\w+).ziroom.com/z/nl/\S+" 71 | 72 | def on_parse(self): 73 | """解析链接""" 74 | house_list = self.response.css.many('#houseList li') 75 | houses = [] 76 | if self.response.css.one('.nomsg').default(None).text() is None: 77 | price_list = self.get_price(self.response) 78 | for house in house_list: 79 | houses.append( 80 | { 81 | 'price': price_list[len(houses)], 82 | 'href': house.one('.img a').attrs['href'][2:], 83 | 'img_src': house.one('.img a img').attrs['src'][2:], 84 | 'block': house.one('.img a img').attrs.get('alt', None), 85 | 'name': house.one('.txt h3 a').text(), 86 | 'site': house.one('.txt h4 a').text(), 87 | 'detail': house.one('.txt .detail').text(), 88 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')] 89 | } 90 | ) 91 | else: 92 | for house in house_list: 93 | houses.append( 94 | { 95 | 'price': re.search("(\d+)", house.one('.price').text())[0], 96 | 'href': house.one('.img a').attrs['href'][2:], 97 | 'img_src': house.one('.img a img').attrs['src'][2:], 98 | 'block': house.one('.img a img').attrs.get('alt', None), 99 | 'name': house.one('.txt h3 a').text(), 100 | 'site': house.one('.txt h4 a').text(), 101 | 'detail': house.one('.txt .detail').text(), 102 | 'room_tags': [tags.text() for tags in house.many('.txt .room_tags span')] 103 | } 104 | ) 105 | return houses 106 | 107 | def on_link(self): 108 | paths = list(set(self.response.re.many('\w+.ziroom.com/z/nl/\S+?.html\\??p?=?\d*'))) 109 | return { 110 | 2: [f'http://{house["href"]}' for house in self.result], 111 | 4: [f'http://{path}' for path in paths] 112 | } 113 | 114 | def on_save(self): 115 | self.logger.info(f'抓到房源列表 {self.result}') 116 | 117 | 118 | class VRTask(ZiRoomTask): 119 | filters = "http://(\w+).ziroom.com/z/vr/(\w+)" 120 | 121 | def on_parse(self): 122 | detail_room = {} 123 | for i in 
self.response.css.many(".detail_room li"): 124 | detail = re.sub('\s', '', i.text()).split(':') 125 | detail_room[detail[0]] = detail[1] 126 | info = { 127 | 'img': [img.attrs['src'] for img in self.response.css.many('.lidiv img')], 128 | 'room_name': self.response.css.one('.room_name h2').default(None).text(), 129 | 'ellipsis': ' '.join(filter( 130 | lambda x: x, self.response.css.one('.room_detail_right .ellipsis').text().split())), 131 | 'room_id': self.response.css.one('#room_id').attrs.get("value"), 132 | 'house_id': self.response.css.one('#house_id').attrs.get("value"), 133 | 'current_city_code': self.response.css.one('#current_city_code').attrs.get('value'), 134 | 'detail_room': detail_room, 135 | 'number': self.response.css.one('.aboutRoom h3').text()[3:], 136 | 'periphery': self.response.css.many('.aboutRoom p')[0].text()[3:], 137 | 'traffic': self.response.css.many('.aboutRoom p')[1].text()[3:] 138 | } 139 | roommate = [] 140 | for i in self.response.css.many('.greatRoommate li'): 141 | roommate.append({ 142 | '性别': i.attrs.get("class")[0], 143 | '房间号': i.one('.user_top p').text(), 144 | '星座': i.one('.sign').text()[0:-2], 145 | '职业': i.one('.jobs').text()[0:-2], 146 | '入住时间': i.one('.user_bottom p').text(), 147 | }) 148 | info['roommate'] = roommate 149 | conf = self.client.get( 150 | url=f"http://www.ziroom.com/detail/config?house_id={info['house_id']}&id={info['room_id']}", 151 | headers={ 152 | "User-Agent": self.ua.chrome 153 | } 154 | ) 155 | configuration = [] 156 | for i in conf.json['data']: 157 | if conf.json['data'].get(i) == 1: 158 | configuration.append(i) 159 | info['configuration'] = configuration 160 | cookies = self.response.cookies.get_dict() 161 | for cookie in cookies: 162 | if 'nlist' in cookie: 163 | info['price'] = json.loads(urllib.parse.unquote( 164 | self.response.cookies.get_dict()[cookie]))[info["room_id"]]['sell_price'] 165 | break 166 | return info 167 | 168 | def on_save(self): 169 | self.logger.info(f'抓到房源信息 {self.result}') 170 | --------------------------------------------------------------------------------
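The `created_at` values in the WeiBo examples above carry a UTC+8 offset (`datetime.timezone(datetime.timedelta(0, 28800))`). A minimal sketch of the conversion `UserTask.parse_status` performs with `strptime`, using a made-up raw string in the shape m.weibo.cn returns:

```python
import datetime

# Hypothetical raw value shaped like the created_at field returned by m.weibo.cn;
# the format string is the one UserTask.parse_status passes to strptime.
raw = "Thu Aug 30 18:36:03 +0800 2018"
created_at = datetime.datetime.strptime(raw, "%a %b %d %H:%M:%S %z %Y")

print(created_at)                                                   # 2018-08-30 18:36:03+08:00
print(created_at.utcoffset() == datetime.timedelta(seconds=28800))  # True, i.e. UTC+8
```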
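Each WeiBo task class declares a `filters` regex (`user:\w+`, `follow:\w+`, `fan:\w+`), and `configs.seeders` plus `on_link` feed matching URLs back into the queue. The routing itself happens inside pyloom's scheduler/worker, which is not shown in this section; the sketch below only illustrates how such patterns relate to the queue URLs, assuming a full-string regex match:

```python
import re

# filters patterns declared by the WeiBo task classes, paired with the class
# names they belong to (illustration only; dispatch is pyloom's job).
filters = [
    (r"user:\w+", "UserTask"),
    (r"follow:\w+", "FollowerTask"),
    (r"fan:\w+", "FanTask"),
]

for url in ["user:1111681197", "follow:1111681197", "fan:1111681197"]:
    matches = [name for pattern, name in filters if re.fullmatch(pattern, url)]
    print(url, "->", matches)
# user:1111681197   -> ['UserTask']
# follow:1111681197 -> ['FollowerTask']
# fan:1111681197    -> ['FanTask']
```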
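Both `configs.py` files set `precision`, the BloomFilter false-positive rate used to skip URLs already marked as finished, and note the trade-off against server memory. pyloom's actual sizing lives in `pyloom/buckets.py` and the Lua scripts, which are not part of this section; as a rough rule of thumb for a standard Bloom filter, the cost per stored URL is about `-ln(p) / (ln 2)^2` bits:

```python
import math

# Rule-of-thumb sizing for a standard Bloom filter; pyloom's own implementation
# may differ, this is only to give a feel for the precision/memory trade-off.
def bloom_bits_per_item(p):
    """Approximate bits per stored URL for a target false-positive rate p."""
    return -math.log(p) / (math.log(2) ** 2)

for p in (0.0001, 0.00001):  # the precisions used by the Ziroom and WeiBo configs
    print(p, round(bloom_bits_per_item(p), 1), "bits/URL")
# 0.0001 -> ~19.2 bits/URL
# 1e-05  -> ~24.0 bits/URL
```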
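`ZiRoomTask.get_price` de-obfuscates prices by OCR-ing a digits image with pytesseract and then indexing the recognized string with per-listing offset lists scraped from the page. A toy walk-through of that decode step, with made-up values standing in for the OCR result and the offsets:

```python
# Made-up stand-ins for what ZiRoomTask.get_price works with: pytesseract reads
# the digits PNG into a string, and the page embeds offset lists that index
# into that string to assemble each listing's price.
digital_table = "4871290653"                 # e.g. pytesseract.image_to_string(...) output
offset_lists = [[3, 3, 7, 6], [3, 1, 9, 6]]  # e.g. parsed from the "[[3,3,7,6],[...]]" blob

prices = ["".join(digital_table[i] for i in offsets) for offsets in offset_lists]
print(prices)  # ['1160', '1830']
```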