├── weibo
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── image.py
│   │   └── video.py
│   ├── configs.example.py
│   ├── items.py
│   ├── api.py
│   ├── utils.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── debug.py
├── scrapy.cfg
├── README.md
├── LICENSE
└── .gitignore

/weibo/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/debug.py:
--------------------------------------------------------------------------------
"""
This script is for debugging only.
"""
from scrapy.cmdline import execute

if __name__ == '__main__':
    cmd = 'scrapy crawl video'
    execute(cmd.split())

--------------------------------------------------------------------------------
/weibo/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = weibo.settings

[deploy]
#url = http://localhost:6800/
project = weibo

--------------------------------------------------------------------------------
/weibo/configs.example.py:
--------------------------------------------------------------------------------
"""
This file contains weibo configs for the spider.
"""

STORE_PATH = './downloads'
CACHE_FILE = f'{STORE_PATH}/cache.pkl'

COOKIES = 'key=value; key=value; ...'

TARGETS = [
    'https://weibo.com/u/0000000000',
    'https://weibo.com/u/1111111111',
]

--------------------------------------------------------------------------------
/weibo/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WeiboItem(scrapy.Item):
    uuid = scrapy.Field()  # as the cache key
    filename = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()

--------------------------------------------------------------------------------
/weibo/api.py:
--------------------------------------------------------------------------------
"""
This module contains weibo's api links.
"""


def info(uid: str) -> str:
    return f'https://weibo.com/ajax/profile/info?uid={uid}'


def get_image_wall(uid: str, since: str = '0') -> str:
    return f'https://weibo.com/ajax/profile/getImageWall?uid={uid}&sinceid={since}'


def get_water_fall(uid: str, cursor: str = '0') -> str:
    return f'https://weibo.com/ajax/profile/getWaterFallContent?uid={uid}&cursor={cursor}'


def large_image(pid: str, cdn: int = 1) -> str:
    return f'https://wx{cdn}.sinaimg.cn/large/{pid}'

--------------------------------------------------------------------------------
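A quick sketch of the URLs these helpers build, using a made-up uid and pid purely for illustration:

```python
from weibo import api

# Made-up identifiers, for illustration only.
uid, pid = '1234567890', '006x1aB2ly1abcdefg'

print(api.info(uid))                # https://weibo.com/ajax/profile/info?uid=1234567890
print(api.get_image_wall(uid))      # https://weibo.com/ajax/profile/getImageWall?uid=1234567890&sinceid=0
print(api.get_water_fall(uid))      # https://weibo.com/ajax/profile/getWaterFallContent?uid=1234567890&cursor=0
print(api.large_image(pid, cdn=2))  # https://wx2.sinaimg.cn/large/006x1aB2ly1abcdefg
```
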
/README.md:
--------------------------------------------------------------------------------
# Weibo Album Crawler

![python](https://img.shields.io/badge/python-3.10-blue)
![scrapy](https://img.shields.io/badge/scrapy-v2.5-blue)

A Sina Weibo crawler based on Scrapy; supports photo albums, videos, and more.

## Set up the environment

```shell
conda create -n weibo python=3.10
conda activate weibo
pip install scrapy
```

## Configure the crawler

* `weibo/settings.py`
  * number of concurrent requests: `CONCURRENT_REQUESTS`
  * video download directory: `FILES_STORE`

* `weibo/configs.py`
  * create the config file: `cp weibo/configs.example.py weibo/configs.py`
  * manually copy and paste the cookies of a logged-in session into `COOKIES`
  * target profile pages: `TARGETS`
  * download directory: `STORE_PATH`

## Run

```shell
scrapy crawl image
scrapy crawl video
```

--------------------------------------------------------------------------------
/weibo/utils.py:
--------------------------------------------------------------------------------
import logging
import os

from scrapy import logformatter


class LogFormatter(logformatter.LogFormatter):
    """
    Log DropItem at the DEBUG level because we intentionally drop a lot of items.
    """

    def dropped(self, item, exception, response, spider):
        formatter = super().dropped(item, exception, response, spider)
        formatter['level'] = logging.DEBUG
        return formatter


def prepare_folder(uid: str, uname: str, store_dir: str):
    """
    Migrate a target's folder (with the old uname, if any) to the new name.
    This is useful if a target has changed their screen name.
    """
    new_folder = f'{uid}_{uname}'

    os.makedirs(store_dir, exist_ok=True)  # the store directory may not exist on the first run
    for old_folder in os.listdir(store_dir):
        if old_folder.startswith(f'{uid}_') and old_folder != new_folder:
            src = os.path.join(store_dir, old_folder)
            dst = os.path.join(store_dir, new_folder)
            os.rename(src, dst)
            break

    return new_folder

--------------------------------------------------------------------------------
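A minimal sketch of how `prepare_folder` migrates an existing folder when a target has renamed themselves; the store directory and the uid/uname values below are made up:

```python
import os
import tempfile

from weibo.utils import prepare_folder

store = tempfile.mkdtemp()
os.mkdir(os.path.join(store, '42_old_name'))  # leftover folder from an earlier run

folder = prepare_folder('42', 'new_name', store)
print(folder)             # 42_new_name
print(os.listdir(store))  # ['42_new_name'] -- the old folder has been renamed
```
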
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Yue Gao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/weibo/spiders/image.py:
--------------------------------------------------------------------------------
import os.path
from urllib.parse import urlparse

import scrapy

from weibo import api, utils
from weibo import configs
from weibo.items import WeiboItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['weibo.com']

    def start_requests(self):
        for target in configs.TARGETS:
            uid = os.path.basename(urlparse(target.rstrip('/')).path)
            yield scrapy.Request(api.info(uid), callback=self.parse_info)

    def parse_info(self, response):
        # prepare data
        user = response.json()['user']
        uid, uname = user['id'], user['screen_name']

        # prepare user folder
        folder = utils.prepare_folder(uid, uname, configs.STORE_PATH)

        # start from the 1st page
        meta = {'uid': uid, 'folder': folder}
        yield scrapy.Request(api.get_image_wall(uid), callback=self.parse_image_wall, meta=meta)

    def parse_image_wall(self, response):
        # prepare data
        data = response.json()
        uid, folder = response.meta['uid'], response.meta['folder']

        # continue to next page
        since = data['since_id']
        yield scrapy.Request(api.get_image_wall(uid, since), callback=self.parse_image_wall, meta=response.meta)

        # yield all images
        self.logger.info(f'{folder} found {len(data["list"]):2d} images (from {response.url})')
        for image in data['list']:
            pid, mid = image['pid'], image['mid']
            filename = f'{folder}/{mid}_{pid}.jpg'
            yield WeiboItem(uuid=pid, filename=filename, file_urls=[api.large_image(pid)])

--------------------------------------------------------------------------------
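For orientation, a minimal illustration of the fields `parse_image_wall` reads once `WeiboAPIMiddleware` has unwrapped the API's `data` envelope; the values are invented and real responses contain many more keys:

```python
# Invented example; only the keys the spider actually consumes are shown.
image_wall_data = {
    'since_id': '4890000000000000',  # fed into the next getImageWall request
    'list': [
        {'pid': '006x1aB2ly1abcdefg', 'mid': '4890123456789012'},  # one photo per entry
    ],
}
```
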
/weibo/pipelines.py:
--------------------------------------------------------------------------------
import os
import pickle

from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline


class WeiboMediaPipeline(FilesPipeline):
    """
    Pipeline to download media files with the filename specified on the item.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        return item['filename']


class BaseMediaKeyCachePipeline(object):
    """
    Base class for key-cache pipelines.

    These pipelines cache the keys of downloaded images / videos, so the files will
    not be downloaded again even if you have deleted them locally.
    Useful if you want to delete unwanted files for good.
    """

    # whether to load the existing key cache on startup
    preload_cache: bool

    def __init__(self, cache_file: str):
        self.cache_file = cache_file
        self.keys_seen = self.load_cache() if self.preload_cache else set()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings['CACHE_FILE'])

    def load_cache(self):
        cache = set()
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'rb') as fp:
                cache = pickle.load(fp)
        return cache


class MediaKeyDuplicatesPipeline(BaseMediaKeyCachePipeline):
    """
    Pipeline to drop items whose keys are already cached.
    Must be placed BEFORE the downloading pipelines.
    """

    # preload the cache to check duplicates
    preload_cache = True

    def process_item(self, item, spider):
        if item['uuid'] in self.keys_seen:
            raise DropItem('Duplicate media key found in item.')
        return item


class MediaKeyCachePipeline(BaseMediaKeyCachePipeline):
    """
    Pipeline to cache the keys of newly downloaded items.
    Must be placed AFTER the downloading pipelines.
    """

    # no need to preload the cache just to update it
    preload_cache = False

    def close_spider(self, spider):
        cache = self.keys_seen | self.load_cache()
        with open(self.cache_file, 'wb') as fp:
            pickle.dump(cache, fp)

    def process_item(self, item, spider):
        if item['files']:
            self.keys_seen.add(item['uuid'])
        return item

--------------------------------------------------------------------------------
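Because the key cache is just a pickled `set`, it can be inspected or pruned outside of Scrapy. A small sketch, assuming the default `CACHE_FILE` from `configs.py` and a made-up key:

```python
import pickle

from weibo import configs

# Load the keys written by MediaKeyCachePipeline.
with open(configs.CACHE_FILE, 'rb') as fp:
    keys = pickle.load(fp)
print(f'{len(keys)} media keys cached')

# Forget one key so that its media will be downloaded again on the next crawl.
keys.discard('006x1aB2ly1abcdefg')
with open(configs.CACHE_FILE, 'wb') as fp:
    pickle.dump(keys, fp)
```
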
/weibo/spiders/video.py:
--------------------------------------------------------------------------------
import os.path
from urllib.parse import urlparse

import scrapy

from weibo import api, utils
from weibo import configs
from weibo.items import WeiboItem


class VideoSpider(scrapy.Spider):
    name = 'video'
    allowed_domains = ['weibo.com']
    download_warnsize = 100 << 20  # 100 MB
    download_timeout = 10 * 60  # 10 min
    video_keys = ['mp4_720p_mp4', 'mp4_hd_url', 'mp4_sd_url']  # preferred resolutions, best first

    def start_requests(self):
        for target in configs.TARGETS:
            uid = os.path.basename(urlparse(target.rstrip('/')).path)
            yield scrapy.Request(api.info(uid), callback=self.parse_info)

    def parse_info(self, response):
        # prepare data
        user = response.json()['user']
        uid, uname = user['id'], user['screen_name']

        # prepare user folder
        folder = utils.prepare_folder(uid, uname, configs.STORE_PATH)

        # start from the first page
        meta = {'uid': uid, 'folder': folder}
        yield scrapy.Request(api.get_water_fall(uid), callback=self.parse_water_fall, meta=meta)

    def parse_water_fall(self, response):
        # prepare data
        data = response.json()
        uid, folder = response.meta['uid'], response.meta['folder']

        # continue to next page
        cursor = data['next_cursor']
        yield scrapy.Request(api.get_water_fall(uid, cursor), callback=self.parse_water_fall, meta=response.meta)

        # yield all videos
        for video in data['list']:
            video, mid = video['page_info'], video['mid']
            video_type = video['object_type']

            match video_type:
                case 'video':
                    # pick the best resolution that is actually present
                    urls = [video['media_info'].get(key) for key in self.video_keys]
                    urls = [url for url in urls if url]
                    url = urls[0] if urls else ''
                    self.logger.info(f'{folder} found 1 video (from {response.url})')
                    yield WeiboItem(uuid=mid, filename=f'{folder}/{mid}.mp4', file_urls=[url])

                case 'story':
                    for i, slide in enumerate(video['slide_cover']['slide_videos']):
                        url = slide['url']
                        yield WeiboItem(uuid=f'{mid}_{i}', filename=f'{folder}/{mid}_{i}.mp4', file_urls=[url])

                case _:
                    self.logger.warning('Unknown video type "%s".', video_type)

--------------------------------------------------------------------------------
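Likewise, a minimal illustration of the two `page_info` shapes `parse_water_fall` handles; the values are invented and only the keys the spider reads are shown:

```python
# Invented example covering both object types handled above.
water_fall_data = {
    'next_cursor': '2',  # fed into the next getWaterFallContent request
    'list': [
        {  # a regular video post
            'mid': '4890123456789012',
            'page_info': {
                'object_type': 'video',
                'media_info': {'mp4_720p_mp4': 'https://example.invalid/720p.mp4'},
            },
        },
        {  # a "story" post consisting of several short slides
            'mid': '4890123456789013',
            'page_info': {
                'object_type': 'story',
                'slide_cover': {'slide_videos': [{'url': 'https://example.invalid/slide_0.mp4'}]},
            },
        },
    ],
}
```
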
/weibo/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for weibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from weibo import configs

BOT_NAME = 'weibo'

SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'weibo (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'weibo.middlewares.WeiboSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CustomCookiesMiddleware': 543,
    'weibo.middlewares.WeiboAPIMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'weibo.pipelines.MediaKeyDuplicatesPipeline': 300,
    'weibo.pipelines.WeiboMediaPipeline': 310,
    'weibo.pipelines.MediaKeyCachePipeline': 320,
}
FILES_STORE = configs.STORE_PATH
CACHE_FILE = configs.CACHE_FILE

LOG_FORMATTER = 'weibo.utils.LogFormatter'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

--------------------------------------------------------------------------------
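The tuning knobs mentioned in the README correspond to the commented defaults above; one possible, conservative override (example values only) could look like:

```python
# Example overrides for weibo/settings.py -- values are illustrative, not prescriptive.
CONCURRENT_REQUESTS = 8      # keep the request rate modest
DOWNLOAD_DELAY = 1           # wait a second between requests to the same site
AUTOTHROTTLE_ENABLED = True  # let Scrapy adapt the delay to server latency
```
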
/weibo/middlewares.py:
--------------------------------------------------------------------------------
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from http.cookies import SimpleCookie
from json import JSONDecodeError

from scrapy import signals
from scrapy.exceptions import IgnoreRequest

from weibo import configs


# useful for handling different item types with a single interface


class WeiboSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class WeiboDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CustomCookiesMiddleware(object):
    """
    Add custom cookies to each request.
    """

    def __init__(self):
        cookies = SimpleCookie()
        cookies.load(configs.COOKIES)
        self.cookies = {k: m.value for k, m in cookies.items()}

    def process_request(self, request, spider):
        request.cookies = self.cookies


class WeiboAPIMiddleware(object):
    """
    Extract the "data" field of api responses.
    """

    def process_response(self, request, response, spider):
        if 'ajax' in response.url:
            try:
                json = response.json()
            except JSONDecodeError:
                raise IgnoreRequest(f'Cookie expired or API changed: cannot parse json from {response.url}')

            if json.get('ok') != 1 or 'data' not in json:
                raise IgnoreRequest(f'API {response.url} returns invalid data: {json}')

            response._cached_decoded_json = json['data']

        return response

--------------------------------------------------------------------------------
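To see what `CustomCookiesMiddleware` does with the `COOKIES` string from `configs.py`, here is the same `SimpleCookie` conversion in isolation; the cookie names and values are placeholders:

```python
from http.cookies import SimpleCookie

# Placeholder string in the same 'key=value; key=value' form as configs.COOKIES.
raw = 'SUB=abc123; SUBP=def456'

cookies = SimpleCookie()
cookies.load(raw)
print({k: m.value for k, m in cookies.items()})  # {'SUB': 'abc123', 'SUBP': 'def456'}
```
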
/.gitignore:
--------------------------------------------------------------------------------
# Private configs
weibo/configs.py

# Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+all
# Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+all

### PyCharm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm+all Patch ###
# Ignores the whole .idea folder and all .iml files
# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360

.idea/*

# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023

*.iml
modules.xml
.idea/misc.xml
*.ipr

# Sonarlint plugin
.idea/sonarlint

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# End of https://www.toptal.com/developers/gitignore/api/python,pycharm+all

--------------------------------------------------------------------------------