├── .gitattributes
├── .gitignore
├── README.md
├── bilibili
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── redis_n.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── bilibili_spider.py
│   ├── start.py
│   └── useragent.py
├── get_aid.py
├── requirements.txt
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bilibili_video_stat
Crawls Bilibili video statistics for big-data analysis of user preferences. Uses scrapy-redis for distributed crawling and reaches roughly 25 million records per day on a 16-core server. It can be deployed long-term to build video trend analyses.

- 1. Proxy IP pool integration
- 2. User-agent pool
- 3. Distributed crawling via scrapy-redis
- 4. Data saved to MongoDB (MySQL is also supported; the corresponding code is included)
- 5. Multi-process operation (tune the per-crawler concurrency in settings.py to match your hardware)
- 6. Complete error records
- 7. A custom retry middleware, UserRetryMiddleware (enable it in settings.py if you want to use it)
- 8. Code for adding start URLs to Redis; see redis_n.py

### Startup commands

```shell
pip3 install -r requirements.txt
python3 start.py <number_of_processes>
# e.g. python3 start.py 32
```

- A proxy IP pool is strongly recommended. A middleware for the https://github.com/arthurmmm/hq-proxies proxy pool is included; enable it in settings.py.

- Configure the Redis and MongoDB connections in settings.py before use.

- pipelines.py contains commented-out code for saving to MySQL; if you use it, add the corresponding settings to settings.py.

## Lessons from crawling tens of millions of records per day:

- 1. Prepare a large proxy IP pool: at around 400 records/second, and given how Bilibili bans IPs (about 1,000 requests in a short window gets an IP banned), the pool needs to hold 300-400 proxies.

- 2. Use short-lived proxy IPs: with a 3-5 minute lifetime, 300-400 proxies are enough; the longer the lifetime, the more proxies the pool needs.

- 3. Prepare a reasonably large user-agent pool; the list in useragent.py is about the right size.

- 4. Enable retries: lower the download timeout to 10 s and raise the retry count to 10. In practice about 30% of requests retried once, about 10% retried twice or more, and the worst case retried 6 times; the retry rate correlates with proxy quality. (See the settings sketch right after this README.)

- 5. If every request ends up being retried, enable DOWNLOAD_DELAY = 0.01. In testing, even a 0.01 s delay noticeably reduced retries; if retries remain heavy, raise it to 0.25 s.

- 6. Record failed URLs in detail so the data set stays complete.
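The retry and delay knobs referred to in points 4-5 above live in bilibili/settings.py. A minimal sketch of the relevant lines, mirroring the values this repository ships with (DOWNLOAD_DELAY is commented out there by default):

```python
# bilibili/settings.py - the throttle/retry knobs referred to above
RETRY_ENABLED = True
RETRY_TIMES = 10          # retry each failed request up to 10 times
DOWNLOAD_TIMEOUT = 10     # give up on a response after 10 seconds
# DOWNLOAD_DELAY = 0.01   # enable only if almost every request retries; raise to 0.25 if needed
```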
--------------------------------------------------------------------------------
/bilibili/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wangler2333/bilibili_video_stat/c0814ea06920a5ca8fdb4504c09995d0ee16c9f4/bilibili/__init__.py

--------------------------------------------------------------------------------
/bilibili/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BilibiliItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class BiliBiliData(scrapy.Item):
    aid = scrapy.Field()
    view = scrapy.Field()
    danmaku = scrapy.Field()
    reply = scrapy.Field()
    favorite = scrapy.Field()
    coin = scrapy.Field()
    share = scrapy.Field()
    time = scrapy.Field()

--------------------------------------------------------------------------------
/bilibili/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from redis import StrictRedis
from scrapy.conf import settings
from scrapy import signals
import random  # for random user-agent selection
from .useragent import agents  # the user-agent list defined in useragent.py
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # built-in user-agent middleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware  # base class for the custom retry middleware


class UserRetryMiddleware(RetryMiddleware):
    # Custom retry middleware (not enabled by default): instead of retrying
    # in-process, it pushes the failed URL back onto the Redis start-URL list
    # so that any worker can pick it up again.
    def _retry(self, request, reason, spider):
        redis_db = StrictRedis(
            host=settings["REDIS_HOST"],
            port=settings["REDIS_PORT"],
            password=settings["REDIS_PASSWORD"],
            db=settings["REDIS_PROXY_DB"],
        )
        print(request.url)
        redis_db.lpush("bilibili_spider:strat_urls", request.url)


class UserAgentmiddleware(UserAgentMiddleware):
    # picks a random user agent for every request
    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class BilibiliDownloaderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(s.spider_error, signal=signals.spider_error)
        return s

    def process_request(self, request, spider):
        # proxy-pool middleware, designed to work with
        # https://github.com/arthurmmm/hq-proxies
        # enable it in settings.py if you want to use it
        redis_db = StrictRedis(
            host=settings["REDIS_HOST"],
            port=settings["REDIS_PORT"],
            password=settings["REDIS_PASSWORD"],
            db=settings["REDIS_PROXY_DB"],
        )

        proxy = redis_db.srandmember("hq-proxies:proxy_pool", 1)
        if proxy:
            proxy = proxy[0].decode()
            spider.logger.info('Using proxy [%s] for [%s]' % (proxy, request.url))
            request.meta['proxy'] = proxy
        else:
            spider.logger.warning('No proxy available, fetching [%s] directly' % request.url)
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)

    def spider_error(self, failure, response, spider):
        # records spider errors; connected to the spider_error signal
        spider.logger.error('[%s], error: %s' % (response.url, failure.getTraceback()))
        with open("./error_spider.txt", "a") as fa:
            fa.write(response.url)
            fa.write("\n")
        with open("./error_spider_info.txt", "a") as fb:
            fb.write("Error on {0}, traceback: {1}".format(response.url, failure.getTraceback()))
            fb.write("\n")
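Activating the optional middlewares above means adjusting DOWNLOADER_MIDDLEWARES in settings.py. A sketch of one possible wiring; the priority numbers are illustrative, only the class paths come from this project:

```python
# settings.py - possible wiring for the optional middlewares (illustrative priorities)
DOWNLOADER_MIDDLEWARES = {
    # swap the built-in retry middleware for the Redis re-queueing variant
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'bilibili.middlewares.UserRetryMiddleware': 530,
    # proxy rotation backed by the hq-proxies pool
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 540,
    # random User-Agent per request
    'bilibili.middlewares.UserAgentmiddleware': 544,
}
```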
--------------------------------------------------------------------------------
/bilibili/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# import pymysql
from scrapy.conf import settings
import logging as logger
import time
from pymongo import MongoClient


class BilibiliPipeline(object):
    def __init__(self):
        # open the database connection
        client = MongoClient(settings["MONGO_HOST"], settings["MONGO_PORT"])
        # select the target database
        db = client["bilibili"]
        db.authenticate(settings["MONGO_USERNAME"], settings["MONGO_PASSWORD"])
        # select the collection:
        # a new collection is created per day, named after the current date
        col_name = "b_video_stat_" + time.strftime("%Y%m%d")
        col = db[col_name]

        self.col = col

    def process_item(self, item, spider):
        try:
            data = dict(item)
            self.col.insert_one(data)
        except Exception as error:
            # record the aid that failed to save
            logger.error(error)
            with open("./error_mongo.txt", "a") as fb:
                fb.write("aid:" + str(item["aid"]))
                fb.write("\n")
        return item

    # saving to MySQL instead
    # def process_item(self, item, spider):
    #     # connect to the database
    #     connect = pymysql.connect(
    #         host=settings["MYSQL_HOST"],
    #         db=settings["MYSQL_DBNAME"],
    #         user=settings["MYSQL_USER"],
    #         passwd=settings["MYSQL_PASSWD"],
    #         charset='utf8mb4',
    #         use_unicode=True)
    #
    #     # run inserts through a cursor
    #     cursor = connect.cursor()
    #     try:
    #         cursor.execute('''insert into b_video_stat(aid,view,danmaku,reply,favorite,coin,share)
    #                        values (%d,%d,%d,%d,%d,%d,%d)''' % (item["aid"],
    #                                                            item["view"],
    #                                                            item["danmaku"],
    #                                                            item["reply"],
    #                                                            item["favorite"],
    #                                                            item["coin"],
    #                                                            item["share"]))
    #         # commit the statement
    #         connect.commit()
    #     except Exception as error:
    #         # log the error together with the offending item
    #         logger.error((error, item))
    #
    #     connect.close()
    #     return item
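The commented-out MySQL pipeline above assumes a b_video_stat table already exists. A sketch of a one-off table creation that matches its INSERT statement; the column names come from the code, while the types, index and connection parameters are assumptions:

```python
# one-off creation of the table used by the commented-out MySQL pipeline
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS b_video_stat (
    aid      BIGINT NOT NULL,
    `view`   BIGINT DEFAULT 0,
    danmaku  BIGINT DEFAULT 0,
    reply    BIGINT DEFAULT 0,
    favorite BIGINT DEFAULT 0,
    coin     BIGINT DEFAULT 0,
    `share`  BIGINT DEFAULT 0,
    KEY idx_aid (aid)
) DEFAULT CHARSET=utf8mb4
"""

# host/user/password/db are placeholders - use the MYSQL_* values from settings.py
connect = pymysql.connect(host="localhost", user="user", passwd="password",
                          db="bilibili", charset="utf8mb4")
with connect.cursor() as cursor:
    cursor.execute(DDL)
connect.commit()
connect.close()
```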
--------------------------------------------------------------------------------
/bilibili/redis_n.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: redis_n.py
# @time: 2018/5/30 5:05 PM
from redis import StrictRedis
import settings
from concurrent.futures import ThreadPoolExecutor


def run(t):
    # push start URLs for the aid range (start, stop) into Redis
    start = t[0]
    stop = t[1]
    redis_db = StrictRedis(
        host=settings.REDIS_HOST,
        port=settings.REDIS_PORT,
        password=settings.REDIS_PASSWORD,
        db=0,
    )

    for i in range(start, stop):
        print(i)

        redis_db.lpush("bilibili_spider:strat_urls", "https://api.bilibili.com/x/web-interface/archive/stat?aid=%s" % i)


def main(start, stop, step):
    # fan the work out over multiple threads
    num_list = []
    # assign an aid range to each thread
    for i in range(start, stop, step):
        start_i = i
        stop_i = i + step
        t = (start_i, stop_i)
        num_list.append(t)
    # fixed thread-pool size; could also be derived from the range
    pool_num = 32
    # pool_num = (stop - start) // step
    with ThreadPoolExecutor(pool_num) as executor:
        # map the aid ranges onto the pool
        executor.map(run, num_list)


if __name__ == '__main__':
    import sys

    # arguments: start, stop, step
    # start: first aid
    # stop: last aid (exclusive)
    # step: number of aids handled per thread
    start_num = int(sys.argv[1])
    stop_num = int(sys.argv[2])
    step_num = int(sys.argv[3])
    main(start_num, stop_num, step_num)
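A quick sanity check after seeding is to look at the length of the Redis list the spiders consume from. A sketch, with placeholder connection values (use the ones configured in settings.py):

```python
# verify that start URLs actually landed in the queue
from redis import StrictRedis

redis_db = StrictRedis(host="127.0.0.1", port=6379, password=None, db=0)  # placeholders

# push a handful of start URLs the same way redis_n.py does
for aid in range(1, 11):
    redis_db.lpush("bilibili_spider:strat_urls",
                   "https://api.bilibili.com/x/web-interface/archive/stat?aid=%s" % aid)

# the spiders pop from this list, so its length is the amount of queued work
print(redis_db.llen("bilibili_spider:strat_urls"))
```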
--------------------------------------------------------------------------------
/bilibili/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'bilibili (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Concurrent requests per crawler process; tune to match your hardware
CONCURRENT_REQUESTS = 1024

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.01
# The download delay setting will honor only one of:
# Per-domain concurrency; set to a huge value, i.e. effectively unlimited
CONCURRENT_REQUESTS_PER_DOMAIN = 100000000
# Per-IP concurrency; 0 means unlimited
CONCURRENT_REQUESTS_PER_IP = 0

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # default headers for every request (the User-Agent itself is set by the random UA middleware)
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

DOWNLOADER_MIDDLEWARES = {
    # retries
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 530,
    # proxy IP pool
    # 'bilibili.middlewares.BilibiliDownloaderMiddleware': 540,
    # random user agent
    'bilibili.middlewares.UserAgentmiddleware': 544,
}

ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    # enable RedisPipeline to also keep items in Redis
    # 'scrapy_redis.pipelines.RedisPipeline': 310,
}

# Allow all status codes through
# HTTPERROR_ALLOW_ALL = True

# Retries
RETRY_ENABLED = True
RETRY_TIMES = 10

# Download timeout
DOWNLOAD_TIMEOUT = 10

# Logging
LOG_FILE = 'mySpider.log'
LOG_LEVEL = 'WARNING'

# MongoDB settings
MONGO_HOST = "ip"
MONGO_PORT = 27017
MONGO_USERNAME = "user"  # set to None if not needed
MONGO_PASSWORD = "password"  # set to None if not needed

#########################################################################
# Scrapy-Redis settings
# Use the Redis-backed scheduler to store the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Make sure all spiders deduplicate requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Requests are serialized with pickle by default, but this can be swapped for
# something similar. Note: this option works on Python 2.x but not on 3.x.
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"

# Don't clear the Redis queues, so crawls can be paused/resumed
# SCHEDULER_PERSIST = True

# Schedule requests with a priority queue (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Other available queue classes
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Maximum idle time, to keep distributed workers from shutting down while they wait.
# Only effective when the queue class above is SpiderQueue or SpiderStack,
# and it can also delay the very first start-up while the queue is still empty.
# SCHEDULER_IDLE_BEFORE_CLOSE = 10


# Redis key under which serialized items are stored
# REDIS_ITEMS_KEY = '%(spider)s:items'

# Items are serialized with ScrapyJSONEncoder by default.
# You can use any importable path to a callable object.
# REDIS_ITEMS_SERIALIZER = 'json.dumps'

# Host and port used when connecting to Redis (optional)
# shared with the proxy-pool middleware
REDIS_HOST = 'ip'
REDIS_PORT = 6379
REDIS_PASSWORD = 'password'  # None if no password
REDIS_PROXY_DB = 10

# Redis connection URL (optional)
# If set, it takes precedence over REDIS_HOST and REDIS_PORT
REDIS_URL = 'redis://user:password@ip:6379'

# Custom redis client parameters (connection timeouts and the like)
# REDIS_PARAMS = {}

# Custom redis client class
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'

# If True, start URLs are fetched from Redis with 'spop'.
# Useful for avoiding duplicates in the start URL list; with this enabled,
# URLs must be added with 'sadd', otherwise a wrong-type error is raised.
# REDIS_START_URLS_AS_SET = False

# Default start_urls key for RedisSpider and RedisCrawlSpider
# REDIS_START_URLS_KEY = '%(name)s:start_urls'

# Use an encoding other than utf-8 for Redis
# REDIS_ENCODING = 'latin1'
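The REDIS_PARAMS hook mentioned above is passed straight to the redis client, so any redis-py connection keyword works there. A sketch with illustrative timeout values:

```python
# settings.py - example use of the optional REDIS_PARAMS hook (values are illustrative)
REDIS_PARAMS = {
    'socket_timeout': 30,          # give up on a blocked read after 30 s
    'socket_connect_timeout': 30,  # give up on connecting after 30 s
    'retry_on_timeout': True,      # retry a command once on timeout
}
```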
--------------------------------------------------------------------------------
/bilibili/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/bilibili/spiders/bilibili_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider
import logging
from scrapy.exceptions import CloseSpider
from bilibili.items import BiliBiliData
import json
import time

logger = logging.getLogger(__name__)


class BilibiliSpiderSpider(RedisSpider):
    name = 'bilibili_spider'
    # allowed_domains = ['bilibili.com']
    # the spider pulls its start URLs from this Redis list
    redis_key = "bilibili_spider:strat_urls"

    def parse(self, response):
        try:
            # if HTTPERROR_ALLOW_ALL = True is set in settings, the status code has to be checked here
            if response.status not in [200, 301, 302, 303, 307]:
                raise CloseSpider("URL: %s returned unexpected status: %s" % (response.url, response.status))
        except CloseSpider as error:
            logger.error(error)
        else:
            try:
                # parse the JSON body
                json_data = json.loads(response.text)
            except Exception as error:
                # on a JSON parse error, record the URL
                json_data = {"code": 403}
                logger.error((response.url, error))
                with open("./error_json.txt", "a") as fb:
                    fb.write(response.url)
                    fb.write("\n")

            item = BiliBiliData()
            if json_data["code"] == 0:
                # unpack the stats; "--" is treated as 0
                data = json_data["data"]
                item['aid'] = data.get("aid")
                item['view'] = data.get("view", 0) if data.get("view", 0) != "--" else 0
                item['danmaku'] = data.get("danmaku", 0) if data.get("danmaku", 0) != "--" else 0
                item['reply'] = data.get("reply", 0) if data.get("reply", 0) != "--" else 0
                item['favorite'] = data.get("favorite", 0) if data.get("favorite", 0) != "--" else 0
                item['coin'] = data.get("coin", 0) if data.get("coin", 0) != "--" else 0
                item['share'] = data.get("share", 0) if data.get("share", 0) != "--" else 0
                item['time'] = time.time()

                yield item

                logger.info("Crawl finished: %s" % response.url)
                # LOG_LEVEL is WARNING, so also emit the completion record at WARNING level
                logger.warning("Done: [%s]" % response.url)
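For reference, this is the shape of response body that parse() above expects. The field list is taken from what the spider reads, not from official API documentation, and the values are illustrative:

```python
# illustrative payload of https://api.bilibili.com/x/web-interface/archive/stat?aid=...
example_response = {
    "code": 0,            # 0 means the aid exists and stats were returned
    "data": {
        "aid": 170001,
        "view": 123456,   # any of these counters may come back as "--",
        "danmaku": 789,   # which the spider normalises to 0
        "reply": 321,
        "favorite": 654,
        "coin": 98,
        "share": 76,
    },
}
```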
--------------------------------------------------------------------------------
/bilibili/start.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: start.py
# @time: 2018/5/30 10:24 PM
from multiprocessing import Pool
import os


def run():
    # launch one crawler process
    cmd = "scrapy crawl bilibili_spider"
    os.system(cmd)


def main(number):
    # create the process pool
    p = Pool(number)
    for n in range(number):
        p.apply_async(run)
    p.close()
    p.join()


if __name__ == '__main__':
    import sys

    # the argument is how many scrapy-redis crawler processes to start
    num = sys.argv[1]
    num = int(num)
    print("Starting [%s] processes" % num)
    main(num)
    print("All processes finished")
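start.py shells out through os.system inside a multiprocessing pool. A roughly equivalent sketch using subprocess, which keeps handles to the children and reports their exit codes; this is an alternative, not what the repository ships:

```python
# alternative launcher: spawn the crawlers with subprocess and report exit codes
import subprocess
import sys


def main(number):
    procs = [subprocess.Popen(["scrapy", "crawl", "bilibili_spider"]) for _ in range(number)]
    for p in procs:
        p.wait()
        print("crawler exited with code %s" % p.returncode)


if __name__ == "__main__":
    main(int(sys.argv[1]))
```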
--------------------------------------------------------------------------------
/bilibili/useragent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: useragent.py
# @time: 2018/5/30 3:56 PM
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

if __name__ == '__main__':
    pass

--------------------------------------------------------------------------------
/get_aid.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: get_aid.py
# @time: 2018/6/3 5:05 PM
from pymongo import MongoClient

MONGO_HOST = "ssh.leepycode.com"
MONGO_PORT = 27017

# open the database connection
client = MongoClient(MONGO_HOST, MONGO_PORT)
# select the target database
db = client["bilibili_data"]
# select the collection for the given date
col_name = "b_video_stat_" + "20180601"
col = db[col_name]
fb = open("./aid.txt", "a")


def run(aid):
    # write the aid to aid.txt if at least one document with that aid exists
    data = col.find({"aid": aid})
    try:
        if data[0]:
            fb.write(str(aid))
            fb.write("\n")
    except IndexError:
        # no document for this aid
        pass


def main(start, stop):
    for n in range(start, stop):
        print(n)
        run(n)
    fb.close()


if __name__ == '__main__':
    main(0, 25000000)
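get_aid.py issues one point query per aid, 25 million in total. A cheaper sketch is to stream the collection once with a projection on "aid"; it reuses the same host, database and collection names as the script above:

```python
# dump all crawled aids with a single pass over the collection
from pymongo import MongoClient

col = MongoClient("ssh.leepycode.com", 27017)["bilibili_data"]["b_video_stat_20180601"]

with open("./aid.txt", "a") as fb:
    # project only the aid field so the cursor stays small
    for doc in col.find({}, {"aid": 1, "_id": 0}):
        fb.write(str(doc["aid"]))
        fb.write("\n")
```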
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pymongo==3.6.1
PyMySQL==0.8.1
redis==2.10.6
Scrapy==1.5.0
scrapy-redis==0.6.8

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = bilibili.settings

[deploy]
;url = http://ssh.leepycode.com:6800/
project = bilibili

--------------------------------------------------------------------------------