├── .gitattributes
├── .gitignore
├── README.md
├── bilibili
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── redis_n.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── bilibili_spider.py
│   ├── start.py
│   └── useragent.py
├── get_aid.py
├── requirements.txt
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bilibili_video_stat
Crawls Bilibili video statistics for big-data analysis of user preferences. Uses scrapy-redis for distributed crawling and reaches roughly 25 million records per day on a 16-core server. It can be deployed long-term to build video trend analyses.

- 1. Proxy IP pool integration
- 2. User-agent pool
- 3. Distributed crawling via scrapy-redis
- 4. Data saved to MongoDB (MySQL is also supported; the corresponding code is included)
- 5. Multi-process operation (tune the per-crawler concurrency in settings.py to match your hardware)
- 6. Complete error records
- 7. A custom retry middleware, UserRetryMiddleware (enable it in settings.py if you want to use it)
- 8. Code for adding start URLs to Redis; see redis_n.py

### Startup commands

```shell
pip3 install -r requirements.txt
python3 start.py <number_of_processes>
# e.g. python3 start.py 32
```

- A proxy IP pool is strongly recommended. A middleware for the https://github.com/arthurmmm/hq-proxies proxy pool is included; enable it in settings.py.

- Configure the Redis and MongoDB connections in settings.py before use.

- pipelines.py contains commented-out code for saving to MySQL; if you use it, add the corresponding settings to settings.py.

## Lessons from crawling tens of millions of records per day:

- 1. Prepare a large proxy IP pool: at around 400 records/second, and given how Bilibili bans IPs (about 1,000 requests in a short window gets an IP banned), the pool needs to hold 300-400 proxies.

- 2. Use short-lived proxy IPs: with a 3-5 minute lifetime, 300-400 proxies are enough; the longer the lifetime, the more proxies the pool needs.

- 3. Prepare a reasonably large user-agent pool; the list in useragent.py is about the right size.

- 4. Enable retries: lower the download timeout to 10 s and raise the retry count to 10. In practice about 30% of requests retried once, about 10% retried twice or more, and the worst case retried 6 times; the retry rate correlates with proxy quality. (See the settings sketch right after this README.)

- 5. If every request ends up being retried, enable DOWNLOAD_DELAY = 0.01. In testing, even a 0.01 s delay noticeably reduced retries; if retries remain heavy, raise it to 0.25 s.

- 6. Record failed URLs in detail so the data set stays complete.
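The retry and delay knobs referred to in points 4-5 above live in bilibili/settings.py. A minimal sketch of the relevant lines, mirroring the values this repository ships with (DOWNLOAD_DELAY is commented out there by default):

```python
# bilibili/settings.py - the throttle/retry knobs referred to above
RETRY_ENABLED = True
RETRY_TIMES = 10          # retry each failed request up to 10 times
DOWNLOAD_TIMEOUT = 10     # give up on a response after 10 seconds
# DOWNLOAD_DELAY = 0.01   # enable only if almost every request retries; raise to 0.25 if needed
```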
--------------------------------------------------------------------------------
/bilibili/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wangler2333/bilibili_video_stat/c0814ea06920a5ca8fdb4504c09995d0ee16c9f4/bilibili/__init__.py

--------------------------------------------------------------------------------
/bilibili/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BilibiliItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class BiliBiliData(scrapy.Item):
    aid = scrapy.Field()
    view = scrapy.Field()
    danmaku = scrapy.Field()
    reply = scrapy.Field()
    favorite = scrapy.Field()
    coin = scrapy.Field()
    share = scrapy.Field()
    time = scrapy.Field()

--------------------------------------------------------------------------------
/bilibili/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from redis import StrictRedis
from scrapy.conf import settings
from scrapy import signals
import random  # for random user-agent selection
from .useragent import agents  # the user-agent list defined in useragent.py
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # built-in user-agent middleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware  # base class for the custom retry middleware


class UserRetryMiddleware(RetryMiddleware):
    # Custom retry middleware (not enabled by default): instead of retrying
    # in-process, it pushes the failed URL back onto the Redis start-URL list
    # so that any worker can pick it up again.
    def _retry(self, request, reason, spider):
        redis_db = StrictRedis(
            host=settings["REDIS_HOST"],
            port=settings["REDIS_PORT"],
            password=settings["REDIS_PASSWORD"],
            db=settings["REDIS_PROXY_DB"],
        )
        print(request.url)
        redis_db.lpush("bilibili_spider:strat_urls", request.url)


class UserAgentmiddleware(UserAgentMiddleware):
    # picks a random user agent for every request
    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class BilibiliDownloaderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(s.spider_error, signal=signals.spider_error)
        return s

    def process_request(self, request, spider):
        # proxy-pool middleware, designed to work with
        # https://github.com/arthurmmm/hq-proxies
        # enable it in settings.py if you want to use it
        redis_db = StrictRedis(
            host=settings["REDIS_HOST"],
            port=settings["REDIS_PORT"],
            password=settings["REDIS_PASSWORD"],
            db=settings["REDIS_PROXY_DB"],
        )

        proxy = redis_db.srandmember("hq-proxies:proxy_pool", 1)
        if proxy:
            proxy = proxy[0].decode()
            spider.logger.info('Using proxy [%s] for [%s]' % (proxy, request.url))
            request.meta['proxy'] = proxy
        else:
            spider.logger.warning('No proxy available, fetching [%s] directly' % request.url)
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)

    def spider_error(self, failure, response, spider):
        # records spider errors; connected to the spider_error signal
        spider.logger.error('[%s], error: %s' % (response.url, failure.getTraceback()))
        with open("./error_spider.txt", "a") as fa:
            fa.write(response.url)
            fa.write("\n")
        with open("./error_spider_info.txt", "a") as fb:
            fb.write("Error on {0}, traceback: {1}".format(response.url, failure.getTraceback()))
            fb.write("\n")
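Activating the optional middlewares above means adjusting DOWNLOADER_MIDDLEWARES in settings.py. A sketch of one possible wiring; the priority numbers are illustrative, only the class paths come from this project:

```python
# settings.py - possible wiring for the optional middlewares (illustrative priorities)
DOWNLOADER_MIDDLEWARES = {
    # swap the built-in retry middleware for the Redis re-queueing variant
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'bilibili.middlewares.UserRetryMiddleware': 530,
    # proxy rotation backed by the hq-proxies pool
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 540,
    # random User-Agent per request
    'bilibili.middlewares.UserAgentmiddleware': 544,
}
```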
--------------------------------------------------------------------------------
/bilibili/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# import pymysql
from scrapy.conf import settings
import logging as logger
import time
from pymongo import MongoClient


class BilibiliPipeline(object):
    def __init__(self):
        # open the database connection
        client = MongoClient(settings["MONGO_HOST"], settings["MONGO_PORT"])
        # select the target database
        db = client["bilibili"]
        db.authenticate(settings["MONGO_USERNAME"], settings["MONGO_PASSWORD"])
        # select the collection:
        # a new collection is created per day, named after the current date
        col_name = "b_video_stat_" + time.strftime("%Y%m%d")
        col = db[col_name]

        self.col = col

    def process_item(self, item, spider):
        try:
            data = dict(item)
            self.col.insert_one(data)
        except Exception as error:
            # record the aid that failed to save
            logger.error(error)
            with open("./error_mongo.txt", "a") as fb:
                fb.write("aid:" + str(item["aid"]))
                fb.write("\n")
        return item

    # saving to MySQL instead
    # def process_item(self, item, spider):
    #     # connect to the database
    #     connect = pymysql.connect(
    #         host=settings["MYSQL_HOST"],
    #         db=settings["MYSQL_DBNAME"],
    #         user=settings["MYSQL_USER"],
    #         passwd=settings["MYSQL_PASSWD"],
    #         charset='utf8mb4',
    #         use_unicode=True)
    #
    #     # run inserts through a cursor
    #     cursor = connect.cursor()
    #     try:
    #         cursor.execute('''insert into b_video_stat(aid,view,danmaku,reply,favorite,coin,share)
    #                        values (%d,%d,%d,%d,%d,%d,%d)''' % (item["aid"],
    #                                                            item["view"],
    #                                                            item["danmaku"],
    #                                                            item["reply"],
    #                                                            item["favorite"],
    #                                                            item["coin"],
    #                                                            item["share"]))
    #         # commit the statement
    #         connect.commit()
    #     except Exception as error:
    #         # log the error together with the offending item
    #         logger.error((error, item))
    #
    #     connect.close()
    #     return item
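The commented-out MySQL pipeline above assumes a b_video_stat table already exists. A sketch of a one-off table creation that matches its INSERT statement; the column names come from the code, while the types, index and connection parameters are assumptions:

```python
# one-off creation of the table used by the commented-out MySQL pipeline
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS b_video_stat (
    aid      BIGINT NOT NULL,
    `view`   BIGINT DEFAULT 0,
    danmaku  BIGINT DEFAULT 0,
    reply    BIGINT DEFAULT 0,
    favorite BIGINT DEFAULT 0,
    coin     BIGINT DEFAULT 0,
    `share`  BIGINT DEFAULT 0,
    KEY idx_aid (aid)
) DEFAULT CHARSET=utf8mb4
"""

# host/user/password/db are placeholders - use the MYSQL_* values from settings.py
connect = pymysql.connect(host="localhost", user="user", passwd="password",
                          db="bilibili", charset="utf8mb4")
with connect.cursor() as cursor:
    cursor.execute(DDL)
connect.commit()
connect.close()
```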
--------------------------------------------------------------------------------
/bilibili/redis_n.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: redis_n.py
# @time: 2018/5/30 5:05 PM
from redis import StrictRedis
import settings
from concurrent.futures import ThreadPoolExecutor


def run(t):
    # push start URLs for the aid range (start, stop) into Redis
    start = t[0]
    stop = t[1]
    redis_db = StrictRedis(
        host=settings.REDIS_HOST,
        port=settings.REDIS_PORT,
        password=settings.REDIS_PASSWORD,
        db=0,
    )

    for i in range(start, stop):
        print(i)

        redis_db.lpush("bilibili_spider:strat_urls", "https://api.bilibili.com/x/web-interface/archive/stat?aid=%s" % i)


def main(start, stop, step):
    # fan the work out over multiple threads
    num_list = []
    # assign an aid range to each thread
    for i in range(start, stop, step):
        start_i = i
        stop_i = i + step
        t = (start_i, stop_i)
        num_list.append(t)
    # fixed thread-pool size; could also be derived from the range
    pool_num = 32
    # pool_num = (stop - start) // step
    with ThreadPoolExecutor(pool_num) as executor:
        # map the aid ranges onto the pool
        executor.map(run, num_list)


if __name__ == '__main__':
    import sys

    # arguments: start, stop, step
    # start: first aid
    # stop: last aid (exclusive)
    # step: number of aids handled per thread
    start_num = int(sys.argv[1])
    stop_num = int(sys.argv[2])
    step_num = int(sys.argv[3])
    main(start_num, stop_num, step_num)
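A quick sanity check after seeding is to look at the length of the Redis list the spiders consume from. A sketch, with placeholder connection values (use the ones configured in settings.py):

```python
# verify that start URLs actually landed in the queue
from redis import StrictRedis

redis_db = StrictRedis(host="127.0.0.1", port=6379, password=None, db=0)  # placeholders

# push a handful of start URLs the same way redis_n.py does
for aid in range(1, 11):
    redis_db.lpush("bilibili_spider:strat_urls",
                   "https://api.bilibili.com/x/web-interface/archive/stat?aid=%s" % aid)

# the spiders pop from this list, so its length is the amount of queued work
print(redis_db.llen("bilibili_spider:strat_urls"))
```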
--------------------------------------------------------------------------------
/bilibili/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'bilibili (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Concurrent requests per crawler process; tune to match your hardware
CONCURRENT_REQUESTS = 1024

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.01
# The download delay setting will honor only one of:
# Per-domain concurrency; set to a huge value, i.e. effectively unlimited
CONCURRENT_REQUESTS_PER_DOMAIN = 100000000
# Per-IP concurrency; 0 means unlimited
CONCURRENT_REQUESTS_PER_IP = 0

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # default headers for every request (the User-Agent itself is set by the random UA middleware)
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

DOWNLOADER_MIDDLEWARES = {
    # retries
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 530,
    # proxy IP pool
    # 'bilibili.middlewares.BilibiliDownloaderMiddleware': 540,
    # random user agent
    'bilibili.middlewares.UserAgentmiddleware': 544,
}

ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    # enable RedisPipeline to also keep items in Redis
    # 'scrapy_redis.pipelines.RedisPipeline': 310,
}

# Allow all status codes through
# HTTPERROR_ALLOW_ALL = True

# Retries
RETRY_ENABLED = True
RETRY_TIMES = 10

# Download timeout
DOWNLOAD_TIMEOUT = 10

# Logging
LOG_FILE = 'mySpider.log'
LOG_LEVEL = 'WARNING'

# MongoDB settings
MONGO_HOST = "ip"
MONGO_PORT = 27017
MONGO_USERNAME = "user"  # set to None if not needed
MONGO_PASSWORD = "password"  # set to None if not needed

#########################################################################
# Scrapy-Redis settings
# Use the Redis-backed scheduler to store the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Make sure all spiders deduplicate requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Requests are serialized with pickle by default, but this can be swapped for
# something similar. Note: this option works on Python 2.x but not on 3.x.
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"

# Don't clear the Redis queues, so crawls can be paused/resumed
# SCHEDULER_PERSIST = True

# Schedule requests with a priority queue (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Other available queue classes
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Maximum idle time, to keep distributed workers from shutting down while they wait.
# Only effective when the queue class above is SpiderQueue or SpiderStack,
# and it can also delay the very first start-up while the queue is still empty.
# SCHEDULER_IDLE_BEFORE_CLOSE = 10


# Redis key under which serialized items are stored
# REDIS_ITEMS_KEY = '%(spider)s:items'

# Items are serialized with ScrapyJSONEncoder by default.
# You can use any importable path to a callable object.
# REDIS_ITEMS_SERIALIZER = 'json.dumps'

# Host and port used when connecting to Redis (optional)
# shared with the proxy-pool middleware
REDIS_HOST = 'ip'
REDIS_PORT = 6379
REDIS_PASSWORD = 'password'  # None if no password
REDIS_PROXY_DB = 10

# Redis connection URL (optional)
# If set, it takes precedence over REDIS_HOST and REDIS_PORT
REDIS_URL = 'redis://user:password@ip:6379'

# Custom redis client parameters (connection timeouts and the like)
# REDIS_PARAMS = {}

# Custom redis client class
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'

# If True, start URLs are fetched from Redis with 'spop'.
# Useful for avoiding duplicates in the start URL list; with this enabled,
# URLs must be added with 'sadd', otherwise a wrong-type error is raised.
# REDIS_START_URLS_AS_SET = False

# Default start_urls key for RedisSpider and RedisCrawlSpider
# REDIS_START_URLS_KEY = '%(name)s:start_urls'

# Use an encoding other than utf-8 for Redis
# REDIS_ENCODING = 'latin1'
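The REDIS_PARAMS hook mentioned above is passed straight to the redis client, so any redis-py connection keyword works there. A sketch with illustrative timeout values:

```python
# settings.py - example use of the optional REDIS_PARAMS hook (values are illustrative)
REDIS_PARAMS = {
    'socket_timeout': 30,          # give up on a blocked read after 30 s
    'socket_connect_timeout': 30,  # give up on connecting after 30 s
    'retry_on_timeout': True,      # retry a command once on timeout
}
```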
--------------------------------------------------------------------------------
/bilibili/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/bilibili/spiders/bilibili_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider
import logging
from scrapy.exceptions import CloseSpider
from bilibili.items import BiliBiliData
import json
import time

logger = logging.getLogger(__name__)


class BilibiliSpiderSpider(RedisSpider):
    name = 'bilibili_spider'
    # allowed_domains = ['bilibili.com']
    # the spider pulls its start URLs from this Redis list
    redis_key = "bilibili_spider:strat_urls"

    def parse(self, response):
        try:
            # if HTTPERROR_ALLOW_ALL = True is set in settings, the status code has to be checked here
            if response.status not in [200, 301, 302, 303, 307]:
                raise CloseSpider("URL: %s returned unexpected status: %s" % (response.url, response.status))
        except CloseSpider as error:
            logger.error(error)
        else:
            try:
                # parse the JSON body
                json_data = json.loads(response.text)
            except Exception as error:
                # on a JSON parse error, record the URL
                json_data = {"code": 403}
                logger.error((response.url, error))
                with open("./error_json.txt", "a") as fb:
                    fb.write(response.url)
                    fb.write("\n")

            item = BiliBiliData()
            if json_data["code"] == 0:
                # unpack the stats; "--" is treated as 0
                data = json_data["data"]
                item['aid'] = data.get("aid")
                item['view'] = data.get("view", 0) if data.get("view", 0) != "--" else 0
                item['danmaku'] = data.get("danmaku", 0) if data.get("danmaku", 0) != "--" else 0
                item['reply'] = data.get("reply", 0) if data.get("reply", 0) != "--" else 0
                item['favorite'] = data.get("favorite", 0) if data.get("favorite", 0) != "--" else 0
                item['coin'] = data.get("coin", 0) if data.get("coin", 0) != "--" else 0
                item['share'] = data.get("share", 0) if data.get("share", 0) != "--" else 0
                item['time'] = time.time()

                yield item

                logger.info("Crawl finished: %s" % response.url)
                # LOG_LEVEL is WARNING, so also emit the completion record at WARNING level
                logger.warning("Done: [%s]" % response.url)
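For reference, this is the shape of response body that parse() above expects. The field list is taken from what the spider reads, not from official API documentation, and the values are illustrative:

```python
# illustrative payload of https://api.bilibili.com/x/web-interface/archive/stat?aid=...
example_response = {
    "code": 0,            # 0 means the aid exists and stats were returned
    "data": {
        "aid": 170001,
        "view": 123456,   # any of these counters may come back as "--",
        "danmaku": 789,   # which the spider normalises to 0
        "reply": 321,
        "favorite": 654,
        "coin": 98,
        "share": 76,
    },
}
```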
--------------------------------------------------------------------------------
/bilibili/start.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: start.py
# @time: 2018/5/30 10:24 PM
from multiprocessing import Pool
import os


def run():
    # launch one crawler process
    cmd = "scrapy crawl bilibili_spider"
    os.system(cmd)


def main(number):
    # create the process pool
    p = Pool(number)
    for n in range(number):
        p.apply_async(run)
    p.close()
    p.join()


if __name__ == '__main__':
    import sys

    # the argument is how many scrapy-redis crawler processes to start
    num = sys.argv[1]
    num = int(num)
    print("Starting [%s] processes" % num)
    main(num)
    print("All processes finished")
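start.py shells out through os.system inside a multiprocessing pool. A roughly equivalent sketch using subprocess, which keeps handles to the children and reports their exit codes; this is an alternative, not what the repository ships:

```python
# alternative launcher: spawn the crawlers with subprocess and report exit codes
import subprocess
import sys


def main(number):
    procs = [subprocess.Popen(["scrapy", "crawl", "bilibili_spider"]) for _ in range(number)]
    for p in procs:
        p.wait()
        print("crawler exited with code %s" % p.returncode)


if __name__ == "__main__":
    main(int(sys.argv[1]))
```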
--------------------------------------------------------------------------------
/bilibili/useragent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: useragent.py
# @time: 2018/5/30 3:56 PM
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

if __name__ == '__main__':
    pass

--------------------------------------------------------------------------------
/get_aid.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
# @version:
# @author: liduo
# @license:
# @file: get_aid.py
# @time: 2018/6/3 5:05 PM
from pymongo import MongoClient

MONGO_HOST = "ssh.leepycode.com"
MONGO_PORT = 27017

# open the database connection
client = MongoClient(MONGO_HOST, MONGO_PORT)
# select the target database
db = client["bilibili_data"]
# select the collection for the given date
col_name = "b_video_stat_" + "20180601"
col = db[col_name]
fb = open("./aid.txt", "a")


def run(aid):
    # write the aid to aid.txt if at least one document with that aid exists
    data = col.find({"aid": aid})
    try:
        if data[0]:
            fb.write(str(aid))
            fb.write("\n")
    except IndexError:
        # no document for this aid
        pass


def main(start, stop):
    for n in range(start, stop):
        print(n)
        run(n)
    fb.close()


if __name__ == '__main__':
    main(0, 25000000)
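get_aid.py issues one point query per aid, 25 million in total. A cheaper sketch is to stream the collection once with a projection on "aid"; it reuses the same host, database and collection names as the script above:

```python
# dump all crawled aids with a single pass over the collection
from pymongo import MongoClient

col = MongoClient("ssh.leepycode.com", 27017)["bilibili_data"]["b_video_stat_20180601"]

with open("./aid.txt", "a") as fb:
    # project only the aid field so the cursor stays small
    for doc in col.find({}, {"aid": 1, "_id": 0}):
        fb.write(str(doc["aid"]))
        fb.write("\n")
```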
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pymongo==3.6.1
PyMySQL==0.8.1
redis==2.10.6
Scrapy==1.5.0
scrapy-redis==0.6.8

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = bilibili.settings

[deploy]
;url = http://ssh.leepycode.com:6800/
project = bilibili

--------------------------------------------------------------------------------