├── spider
    ├── tools
    │   ├── __init__.py
    │   ├── db.py
    │   ├── statsd.py
    │   ├── task.py
    │   └── common.py
    ├── publish.py
    ├── config
    │   ├── __init__.py
    │   ├── sites.yaml
    │   ├── params.yaml
    │   ├── logging.yaml
    │   └── conf.py
    ├── pull
    │   ├── __init__.py
    │   └── you_get.py
    ├── __init__.py
    ├── extract
    │   ├── __init__.py
    │   ├── bilibili.py
    │   └── miaopai.py
    ├── models
    │   ├── __init__.py
    │   ├── tables.sql
    │   └── videos.py
    ├── celeryconfig.py
    ├── download.py
    └── parse.py
├── doc
    ├── parker.png
    └── jiankong.png
├── requirements.txt
├── .gitignore
└── README.md

/spider/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/spider/publish.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Publish downloaded videos"""
3 | 
--------------------------------------------------------------------------------
/spider/config/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Crawler configuration"""
3 | 
--------------------------------------------------------------------------------
/spider/pull/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Pull videos"""
3 | 
--------------------------------------------------------------------------------
/doc/parker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LiuRoy/parker/HEAD/doc/parker.png
--------------------------------------------------------------------------------
/doc/jiankong.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LiuRoy/parker/HEAD/doc/jiankong.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | celery==4.0.2
2 | you-get==0.4.652
3 | PyYAML==3.12
4 | requests==2.13.0
5 | pyquery==1.2.17
6 | SQLAlchemy==1.1.6
7 | PyMySQL==0.7.10
8 | statsd==3.2.1
--------------------------------------------------------------------------------
/spider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Crawler"""
3 | 
4 | from celery import Celery
5 | 
6 | app = Celery("spider")
7 | app.config_from_object("spider.celeryconfig")
--------------------------------------------------------------------------------
/spider/extract/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Parse pages to extract video URLs"""
3 | 
4 | from .
import ( 5 | bilibili, 6 | miaopai, 7 | ) 8 | 9 | __all__ = ['bilibili', 'miaopai'] 10 | -------------------------------------------------------------------------------- /spider/tools/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import sessionmaker 2 | from sqlalchemy.engine import create_engine 3 | 4 | 5 | def make_session(db_url): 6 | """根据数据库配置生成会话对象""" 7 | engine = create_engine(db_url) 8 | return sessionmaker(bind=engine) 9 | -------------------------------------------------------------------------------- /spider/tools/statsd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """假监控客户端""" 3 | 4 | 5 | class FakeStatsdClient(object): 6 | """假客户端""" 7 | def __init__(self, *args, **kwargs): 8 | pass 9 | 10 | def incr(self, *args, **kwargs): 11 | pass 12 | -------------------------------------------------------------------------------- /spider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """存储数据库""" 3 | from spider.tools.db import make_session 4 | from spider.config.conf import params 5 | from sqlalchemy.ext.declarative import declarative_base 6 | 7 | 8 | BaseModel = declarative_base() 9 | DBSession = make_session(params['mysql_url']) 10 | -------------------------------------------------------------------------------- /spider/config/sites.yaml: -------------------------------------------------------------------------------- 1 | sites: 2 | - name: bilibili-1 3 | url: http://space.bilibili.com/21474566/#!/video # 超级镜子 4 | task: spider.parse.bilibili 5 | minute: 1 6 | - name: miaopai-2 7 | url: http://m.miaopai.com/v2_index/u/paike_5no3e2iw6g # 陈翔六点半 8 | task: spider.parse.miaopai 9 | minute: 1 10 | -------------------------------------------------------------------------------- /spider/config/params.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | - mode: release 3 | broker_url: amqp://guest:guest@192.168.99.100:5672/parker 4 | mysql_url: mysql+pymysql://root:root@192.168.99.100:3306/parker?charset=utf8mb4 5 | download_path: /home/liuruoyu/Desktop/github/parker 6 | statsd_address: 192.168.99.100:8125 7 | video_number_per_page: 1 8 | download_timeout: 3600 9 | -------------------------------------------------------------------------------- /spider/celeryconfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """celery配置""" 3 | 4 | from spider.config.conf import ( 5 | load_sites, 6 | params 7 | ) 8 | 9 | BROKER_URL = params['broker_url'] 10 | 11 | CELERY_TIMEZONE = 'Asia/Shanghai' 12 | 13 | CELERY_IMPORTS = ( 14 | 'spider.parse', 15 | 'spider.download', 16 | ) 17 | 18 | CELERYBEAT_SCHEDULE = load_sites() 19 | 20 | CELERY_TASK_SERIALIZER = "pickle" 21 | CELERY_ACCEPT_CONTENT = ['pickle', 'json'] 22 | -------------------------------------------------------------------------------- /spider/tools/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """celery base task""" 3 | import celery 4 | from spider.config.conf import logger 5 | 6 | 7 | class ParkerTask(celery.Task): 8 | def on_failure(self, exc, task_id, args, kwargs, einfo): 9 | logger.error("task {} error".format(task_id)) 10 | logger.exception(exc) 11 | return super(ParkerTask, self).on_failure( 12 | exc, task_id, args, kwargs, 
einfo) 13 | 14 | def on_success(self, retval, task_id, args, kwargs): 15 | logger.info("task {} done".format(task_id)) 16 | return super(ParkerTask, self).on_success( 17 | retval, task_id, args, kwargs) 18 | -------------------------------------------------------------------------------- /spider/download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """下载视频任务""" 3 | from spider import app 4 | from spider.tools.task import ParkerTask 5 | from spider.pull.you_get import ( 6 | get_video_info, 7 | download_video 8 | ) 9 | 10 | 11 | @app.task(base=ParkerTask) 12 | def bilibili(video): 13 | """根据bilibili播放地址下载视频 14 | 15 | Args: 16 | video (Videos): 视频记录 17 | """ 18 | video_info = get_video_info(video.video_url, video.task_id) 19 | download_video(video_info) 20 | 21 | 22 | @app.task(base=ParkerTask) 23 | def miaopai(video): 24 | """根据miaopai播放地址下载视频 25 | 26 | Args: 27 | video (Videos): 视频记录 28 | """ 29 | video_info = get_video_info(video.video_url, video.task_id) 30 | download_video(video_info) 31 | -------------------------------------------------------------------------------- /spider/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析页面任务""" 3 | from spider import app 4 | from spider import download 5 | from spider.tools.task import ParkerTask 6 | from spider import extract 7 | 8 | 9 | @app.task(base=ParkerTask) 10 | def bilibili(url, name): 11 | """抓取哔哩哔哩 解析获取最新视频地址 12 | 13 | Args: 14 | url (string): 哔哩哔哩页面地址 15 | name (string): 定时任务名称 16 | """ 17 | new_videos = extract.bilibili.extract_videos(url, name) 18 | if new_videos: 19 | for video in new_videos: 20 | download.bilibili.delay(video) 21 | 22 | 23 | @app.task(base=ParkerTask) 24 | def miaopai(url, name): 25 | """抓取秒拍页面 解析获取最新视频地址 26 | 27 | Args: 28 | url (string): 美拍页面地址 29 | name (string): 定时任务名称 30 | """ 31 | new_videos = extract.miaopai.extract_videos(url, name) 32 | if new_videos: 33 | for video in new_videos: 34 | download.miaopai.delay(video) 35 | -------------------------------------------------------------------------------- /spider/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: no 3 | 4 | formatters: 5 | simple: 6 | format: '%(asctime)s [%(levelname)s] [%(process)d] %(filename)s-%(lineno)d: %(message)s' 7 | detail: 8 | format: '%(asctime)s [%(levelname)s] [%(process)d] %(pathname)s-%(lineno)d: %(message)s' 9 | 10 | handlers: 11 | console: 12 | class: logging.StreamHandler 13 | level: DEBUG 14 | formatter: detail 15 | stream: ext://sys.stdout 16 | files: 17 | class: logging.handlers.WatchedFileHandler 18 | level: DEBUG 19 | formatter: simple 20 | filename: /home/liuruoyu/Desktop/github/parker/worker.log 21 | 22 | loggers: 23 | parker.debug: 24 | level: DEBUG 25 | handlers: [console] 26 | propagate: no 27 | qualname: parker.debug 28 | parker.release: 29 | level: INFO 30 | handlers: [files] 31 | propagate: no 32 | qualname: parker.release 33 | 34 | root: 35 | level: DEBUG 36 | handlers: [console] 37 | -------------------------------------------------------------------------------- /spider/models/tables.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA `parker` DEFAULT CHARACTER SET utf8mb4 ; 2 | 3 | CREATE TABLE `download_info` ( 4 | `id` int(11) NOT NULL AUTO_INCREMENT, 5 | `video_id` int(11) NOT NULL COMMENT '视频id', 6 | `video_url` 
varchar(200) NOT NULL COMMENT '播放url', 7 | `video_title` varchar(200) NOT NULL COMMENT '视频标题', 8 | `video_size` int(11) NOT NULL DEFAULT '0' COMMENT '视频大小', 9 | `status` tinyint(4) NOT NULL DEFAULT '0' COMMENT '是否下载完成 1下载完成 0未下载', 10 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 11 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 12 | PRIMARY KEY (`id`), 13 | KEY `ix_video_id` (`video_id`) 14 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='视频下载信息表'; 15 | 16 | CREATE TABLE `web_video` ( 17 | `id` int(11) NOT NULL AUTO_INCREMENT, 18 | `source` varchar(10) NOT NULL COMMENT '网站类型', 19 | `task_id` int(11) NOT NULL COMMENT '任务id', 20 | `img_url` varchar(200) NOT NULL COMMENT '视频封面链接', 21 | `duration` int(11) NOT NULL COMMENT '视频时长', 22 | `title` varchar(200) NOT NULL DEFAULT '' COMMENT '视频标题', 23 | `video_url` varchar(200) NOT NULL COMMENT '视频播放页面', 24 | `video_url_md5` varchar(32) NOT NULL COMMENT '视频播放页面', 25 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 26 | PRIMARY KEY (`id`), 27 | KEY `ix_video_url_md5` (`video_url_md5`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='网站视频信息'; 29 | -------------------------------------------------------------------------------- /spider/config/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析配置文件""" 3 | import os 4 | import yaml 5 | import logging 6 | import logging.config 7 | from datetime import timedelta 8 | 9 | import statsd 10 | from spider.tools.statsd import FakeStatsdClient 11 | 12 | current_dir = os.path.dirname(__file__) 13 | sites_path = os.path.join(current_dir, 'sites.yaml') 14 | params_path = os.path.join(current_dir, 'params.yaml') 15 | logging_path = os.path.join(current_dir, 'logging.yaml') 16 | 17 | with open(logging_path, 'r') as f: 18 | logging.config.dictConfig(yaml.load(f)) 19 | 20 | 21 | def load_sites(): 22 | """解析sites.yaml 生成CELERYBEAT_SCHEDULE""" 23 | with open(sites_path, 'r') as f: 24 | sites = yaml.load(f) 25 | return {x['name']: { 26 | 'task': x['task'], 27 | 'schedule': timedelta(minutes=int(x['minute'])), 28 | 'args': (x['url'], x['name']) 29 | } for x in sites['sites']} 30 | 31 | 32 | def load_params(): 33 | """解析params.yaml""" 34 | with open(params_path, 'r') as f: 35 | p = yaml.load(f) 36 | return p['params'][0] 37 | 38 | 39 | params = load_params() 40 | if params['mode'] == 'debug': 41 | logger = logging.getLogger('parker.debug') 42 | statsd_client = FakeStatsdClient() 43 | else: 44 | logger = logging.getLogger('parker.release') 45 | statsd_host, statsd_port = params['statsd_address'].split(':') 46 | statsd_client = statsd.StatsClient( 47 | host=statsd_host, port=int(statsd_port), prefix='parker') 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds 
the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | .venv/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | .idea/ 95 | -------------------------------------------------------------------------------- /spider/tools/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """一些公用结构体""" 3 | import hashlib 4 | from collections import namedtuple 5 | 6 | 7 | WebVideo = namedtuple( 8 | 'WebVideo', [ 9 | 'source', # string 网站类型 10 | 'task_id', # int 定时任务id 11 | 'img_url', # string 图片链接 12 | 'duration', # int 播放时长 13 | 'title', # string 视频标题 14 | 'video_url', # string 视频链接 15 | 'video_url_md5', # string 视频链接md值 16 | ]) 17 | 18 | VideoInfo = namedtuple( 19 | 'VideoInfo', [ 20 | 'video_id', # int Videos记录id 21 | 'video_url', # string 播放url 22 | 'title', # string 视频标题 23 | 'size', # int 视频大小 24 | ]) 25 | 26 | 27 | def parse_task(task_name): 28 | """解析定时任务名称获取任务类型和id 29 | 30 | Args: 31 | task_name (string): 任务名称 32 | Returns: 33 | source (string): 网站类型 34 | task_id int: 任务id 35 | """ 36 | source, task_id = task_name.split('-') 37 | return source, int(task_id) 38 | 39 | 40 | def parse_video_time(v_time): 41 | """解析视频时长 42 | 43 | Args: 44 | v_time (string): 视频时长 格式: 4:50:89 01:29 45 | Returns: 46 | int 视频时长秒数 47 | """ 48 | items = v_time.split(':') 49 | if len(items) == 2: 50 | return int(items[0]) * 60 + int(items[1]) 51 | 52 | if len(items) == 3: 53 | return int(items[0]) * 3600 + int(items[1]) * 60 + int(items[2]) 54 | 55 | return 0 56 | 57 | 58 | def get_md5(content): 59 | """计算md5 60 | 61 | Args: 62 | content (string): 要计算md5的字符串 63 | Returns: 64 | string: 计算好的md5值 65 | """ 66 | md5 = hashlib.md5() 67 | md5.update(content.encode('utf-8')) 68 | return md5.hexdigest() 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parker 2 | 3 | parker短视频爬虫项目(项目地址:[https://github.com/LiuRoy/parker](https://github.com/LiuRoy/parker)),它采用celery框架定时爬取用户视频列表,将最新发布的视频通过you-get异步下载,可以很方便地实现分布式部署。因为各个网站的页面布局和接口更新比较频繁,为了保证程序的高可用,特意增加了Statsd监控,方便及时发现出错。 4 | 5 | ## 代码架构 6 | 7 | ![代码架构](./doc/parker.png) 8 | 9 | 目前parker中只实现了B站和秒拍的下载,从框架图可以看出,针对每一类网站,需要实现两个异步接口:从用户视频主页解析发布视频的播放地址、根据播放地址下载视频。因此增加网站类型,不需要修改原来的代码,只需要添加新的解析和下载接口即可。针对视频下载完成之后的后续操作,我还没有实现,大家可以根据自己的需求自由的去实现。 10 | 11 | 在运行的时候,celery会将配置好的优质用户列表定时发送到对应网站的解析接口异步执行,筛选出最新发布的视频播放地址,交给对应的下载接口异步下载,下载完成之后再异步调用后续操作。因此需要启动一个celery beat进程发送定时任务,以及若干celery异步任务去执行解析和下载操作,对于比较大的视频,下载会相当耗时,建议根据任务列表的多少合理分配异步任务的个数。 12 | 13 | ## 程序运行 14 | 15 | 经验证,此程序可以在ubuntu和mac下正常运行, 由于本地windows下的celery无法正常启动,所以没有在windows环境做过验证。 16 | 17 
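The architecture section above describes how celery beat periodically pushes each configured user page to the matching parse task. With the default `sites.yaml`, the `load_sites()` helper in `spider/config/conf.py` expands that configuration into a beat schedule equivalent to the snippet below; this is only an illustration of the mechanism, not a file in the repository.

```python
from datetime import timedelta

# Equivalent of what load_sites() returns for the default sites.yaml:
# one beat entry per hot user, re-checked every `minute` minutes.
CELERYBEAT_SCHEDULE = {
    'bilibili-1': {
        'task': 'spider.parse.bilibili',
        'schedule': timedelta(minutes=1),
        'args': ('http://space.bilibili.com/21474566/#!/video', 'bilibili-1'),
    },
    'miaopai-2': {
        'task': 'spider.parse.miaopai',
        'schedule': timedelta(minutes=1),
        'args': ('http://m.miaopai.com/v2_index/u/paike_5no3e2iw6g', 'miaopai-2'),
    },
}
```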
| ### 依赖库安装 18 | 19 | python版本为3.5,进入项目目录后,执行: 20 | 21 | ```bash 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | ### 创建数据库表 26 | 27 | 提前在数据库中建好两张表(sql: [https://github.com/LiuRoy/parker/blob/master/spider/models/tables.sql](https://github.com/LiuRoy/parker/blob/master/spider/models/tables.sql)) 28 | 29 | ### 参数配置 30 | 31 | config路径下的logging.yaml、params.yaml、sites.yaml分别对应日志配置、运行参数配置、热门用户配置。 32 | 33 | #### 日志配置 34 | 35 | debug模式下日志会直接输出在标准输出流,release模式下会将日志内容输出到文件中,因此需要配置输出日志文件。 36 | 37 | #### 运行配置 38 | 39 | + mode debug调试模式,此模式下日志指向标准输出,并且没有监控数据;release模式下,日志输出到制定文件,并且有监控数据。 40 | + broker_url 对应于celery的BROKER_URL,可以配置为redis或者rabbitmq 41 | + mysql_url 数据库地址,需要提前建好两张表 42 | + download_path 视频下载路径 43 | + statsd_address 监控地址 44 | + video_number_per_page 每次从用户视频主页解析出多少条视频播放地址,因为大部分用户每次发布的视频个数很少,只需要设置成一个很小的值即可。在初次运行的时候,也不会下载大量久远的视频。 45 | + download_timeout 视频下载的超时时间 46 | 47 | #### 热门用户配置 48 | 49 | parker会根据此配置生成一份celery beat scheduler列表。 50 | 51 | + name 规则是`<网站类型>-<任务id>`,parker会根据此作为scheduler任务名称 52 | + url 用户的发布视频主页 53 | + task 对应的celery解析异步任务 54 | + minute 多少分钟检查一次用户视频列表 55 | 56 | ### 启动任务 57 | 58 | 进入项目目录,执行下面命令启动celery worker 59 | 60 | ```bash 61 | celery -A spider worker 62 | ``` 63 | 64 | 执行下面命令启动celery beat定时任务 65 | 66 | ```bash 67 | celery -A spider beat 68 | ``` 69 | 70 | ## 监控 71 | 72 | 强烈安利一个docker镜像 [https://hub.docker.com/r/samuelebistoletti/docker-statsd-influxdb-grafana/](https://hub.docker.com/r/samuelebistoletti/docker-statsd-influxdb-grafana/),一分钟配好监控环境有木有。之后只需要添加执行成功和执行异常的打点数据,就可以方便的监控程序是否正常运行了。 73 | 74 | ![监控](./doc/jiankong.png) -------------------------------------------------------------------------------- /spider/extract/bilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析bilibili页面""" 3 | import re 4 | import requests 5 | from spider.tools.common import ( 6 | WebVideo, 7 | get_md5, 8 | parse_task, 9 | parse_video_time, 10 | ) 11 | from spider.models.videos import Videos 12 | from spider.config.conf import ( 13 | params, 14 | logger, 15 | statsd_client, 16 | ) 17 | 18 | 19 | def get_user_id(url): 20 | """从个人视频页链接解析出用户id 21 | 22 | Args: 23 | url (string): 人视频页链接 24 | Returns: 25 | int: 用户id 26 | """ 27 | result = re.search(r'\d+', url) 28 | return int(result.group()) 29 | 30 | 31 | def get_video_lists(video_data, source, task_id): 32 | """从获取的视频数据中找到感兴趣的数据 33 | 34 | Args: 35 | video_data (dict): 视频数据 36 | source (string): 网站类型 37 | task_id (int): 任务id 38 | Returns: 39 | list 视频信息列表 40 | """ 41 | result = [] 42 | for item in video_data['data']['vlist']: 43 | v_time = item['length'] 44 | video_url = 'http://www.bilibili.com/video/av{}/'.format(item['aid']) 45 | 46 | result.append(WebVideo( 47 | source=source, 48 | task_id=task_id, 49 | img_url=item['pic'], 50 | duration=parse_video_time(v_time), 51 | title=item['title'], 52 | video_url=video_url, 53 | video_url_md5=get_md5(video_url) 54 | )) 55 | return result 56 | 57 | 58 | def extract_videos(url, name): 59 | """解析视频列表 60 | 61 | Args: 62 | url (string): 哔哩哔哩页面地址 63 | name (string): 定时任务名称 64 | """ 65 | user_id = get_user_id(url) 66 | source, task_id = parse_task(name) 67 | 68 | request_url = 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&' \ 69 | 'pagesize={}&tid=0&page=1&keyword=&order=senddate'.format( 70 | user_id, params['video_number_per_page']) 71 | try: 72 | response = requests.get(request_url, timeout=10) 73 | if not response.ok: 74 | statsd_client.incr('bilibili.extract.exc') 75 | logger.error('request failure. 
url:{} name:{}'.format(request_url, name)) 76 | return 77 | 78 | video_data = response.json() 79 | videos = get_video_lists(video_data, source, task_id) 80 | except Exception as exc: 81 | statsd_client.incr('bilibili.extract.exc') 82 | logger.error('request failure. url:{} name:{}'.format(request_url, name)) 83 | logger.exception(exc) 84 | else: 85 | statsd_client.incr('bilibili.extract.suc') 86 | logger.info('request success. url:{} name:{}'.format(request_url, name)) 87 | new_videos = Videos.filter_exist(videos) 88 | Videos.batch_add(new_videos) 89 | return new_videos 90 | -------------------------------------------------------------------------------- /spider/pull/you_get.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """用you-get库下载视频""" 3 | import re 4 | import subprocess 5 | 6 | from spider.config.conf import params 7 | from spider.tools.common import VideoInfo 8 | from spider.models.videos import DownloadInfo 9 | from spider.config.conf import ( 10 | logger, 11 | statsd_client, 12 | ) 13 | 14 | 15 | def parse_size(size_info): 16 | """解析视频大小信息 17 | 18 | Args: 19 | size_info (string): eg: 62.9 MiB (65951953 bytes) 20 | Returns: 21 | int 22 | """ 23 | result = re.search(r'(\d+) bytes', size_info) 24 | if result: 25 | return int(result.group(1)) 26 | return 0 27 | 28 | 29 | def get_video_info(play_url, video_id): 30 | """获取视频信息 31 | 32 | Args: 33 | play_url (string): 播放地址 34 | video_id (int): 视频id 35 | """ 36 | try: 37 | info_comd = "you-get -i {}".format(play_url) 38 | p = subprocess.Popen(info_comd, shell=True, stdout=subprocess.PIPE) 39 | p.wait() 40 | content = p.stdout.read() 41 | content = content.decode('utf-8') 42 | 43 | video_title = re.search(r'Title:\s+(.*?)\s+Type:', content).group(1) 44 | video_size = int(re.search(r'\((\d+) Bytes\)', content).group(1)) 45 | 46 | video_info = VideoInfo( 47 | video_id=video_id, 48 | video_url=play_url, 49 | title=video_title, 50 | size=video_size, 51 | ) 52 | logger.info(video_info) 53 | except Exception as exc: 54 | statsd_client.incr('youget.info.exc') 55 | logger.error('you-get info failure: url:{} video:{}'.format( 56 | play_url, video_id)) 57 | logger.error(exc) 58 | else: 59 | statsd_client.incr('youget.info.suc') 60 | logger.info('you-get info success: url:{} video:{}'.format( 61 | play_url, video_id)) 62 | DownloadInfo.add(video_info) 63 | return video_info 64 | 65 | 66 | def download_video(video_info): 67 | """下载视频 68 | 69 | Args: 70 | video_info (VideoInfo): 视频下载信息 71 | """ 72 | try: 73 | download_comd = "you-get {}".format(video_info.video_url) 74 | p = subprocess.Popen(download_comd, 75 | shell=True, cwd=params['download_path']) 76 | p.wait(int(params['download_timeout'])) 77 | if p.returncode != 0: 78 | raise Exception("download failed") 79 | except Exception as exc: 80 | statsd_client.incr('youget.download.exc') 81 | logger.error('you-get download failure: url:{} video:{}'.format( 82 | video_info.video_url, video_info.video_id)) 83 | logger.exception(exc) 84 | else: 85 | statsd_client.incr('youget.download.suc') 86 | logger.info('you-get download success: url:{} video:{}'.format( 87 | video_info.video_url, video_info.video_id)) 88 | DownloadInfo.update_status(video_info.video_id) 89 | -------------------------------------------------------------------------------- /spider/extract/miaopai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """获取没拍视频列表""" 3 | import re 4 | import os 5 | import 
requests 6 | from pyquery import PyQuery 7 | 8 | from spider.tools.common import ( 9 | WebVideo, 10 | get_md5, 11 | parse_task, 12 | ) 13 | from spider.models.videos import Videos 14 | from spider.config.conf import ( 15 | params, 16 | logger, 17 | statsd_client, 18 | ) 19 | 20 | 21 | def get_user_id(html): 22 | """从html中找到user_id 23 | 24 | Args: 25 | html (string): 页面 26 | Returns: 27 | string: 用户id 28 | """ 29 | result = re.search(r"var suid = '(.+?)';", html) 30 | return result.group(1) 31 | 32 | 33 | def get_video_lists(html, source, task_id): 34 | """从获取的视频数据中找到感兴趣的数据 35 | 36 | Args: 37 | html (string): 视频html页面数据 38 | source (string): 网站类型 39 | task_id (int): 任务id 40 | Returns: 41 | list 视频信息列表 42 | """ 43 | page = PyQuery(html) 44 | result = [] 45 | for item in page("div[class='card_wrapping']"): 46 | title = item.xpath("./div[@class='h_title']")[0].text 47 | img_url = item.xpath("./a/div")[0].get("data-url") 48 | play_data = os.path.basename(img_url) 49 | video_url = 'http://www.miaopai.com/show/{}.htm'.format( 50 | play_data.split('_')[0]) 51 | 52 | result.append(WebVideo( 53 | source=source, 54 | task_id=task_id, 55 | img_url=img_url, 56 | duration=0, 57 | title=title, 58 | video_url=video_url, 59 | video_url_md5=get_md5(video_url) 60 | )) 61 | return result 62 | 63 | 64 | def extract_videos(url, name): 65 | """解析视频列表 66 | 67 | Args: 68 | url (string): 哔哩哔哩页面地址 69 | name (string): 定时任务名称 70 | """ 71 | source, task_id = parse_task(name) 72 | try: 73 | response = requests.get(url, timeout=10) 74 | if not response.ok: 75 | statsd_client.incr('miaopai.extract.exc') 76 | logger.error('request failure. url:{} name:{}'.format(url, name)) 77 | return 78 | 79 | user_id = get_user_id(response.text) 80 | video_page_url = 'http://m.miaopai.com/show/getOwnerVideo?suid={}&page=1&per={}'.format( 81 | user_id, params['video_number_per_page']) 82 | response = requests.get(video_page_url, timeout=10) 83 | if not response.ok: 84 | statsd_client.incr('miaopai.extract.exc') 85 | logger.error('request failure. url:{} name:{}'.format(video_page_url, name)) 86 | return 87 | 88 | video_div = response.json()['msg'] 89 | videos = get_video_lists(video_div, source, task_id) 90 | except Exception as exc: 91 | statsd_client.incr('miaopai.extract.exc') 92 | logger.error('request failure. name:{}'.format(name)) 93 | logger.exception(exc) 94 | else: 95 | statsd_client.incr('miaopai.extract.suc') 96 | logger.info('request success. 
name:{}'.format(name)) 97 | new_videos = Videos.filter_exist(videos) 98 | Videos.batch_add(new_videos) 99 | return new_videos 100 | -------------------------------------------------------------------------------- /spider/models/videos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """视频存储""" 3 | import datetime 4 | from sqlalchemy import ( 5 | Column, 6 | String, 7 | Integer, 8 | SmallInteger, 9 | DateTime, 10 | ) 11 | from spider.models import ( 12 | DBSession, 13 | BaseModel, 14 | ) 15 | 16 | 17 | class Videos(BaseModel): 18 | 19 | __tablename__ = 'web_video' 20 | 21 | id = Column(Integer, primary_key=True) 22 | source = Column(String(10), nullable=False) 23 | task_id = Column(Integer, nullable=False) 24 | img_url = Column(String(200), nullable=False) 25 | duration = Column(Integer, nullable=False) 26 | title = Column(String(200), nullable=False) 27 | video_url = Column(String(200), nullable=False) 28 | video_url_md5 = Column(String(32), nullable=False) 29 | created_at = Column(DateTime, nullable=False, default=datetime.datetime.now) 30 | 31 | @classmethod 32 | def filter_exist(cls, videos): 33 | """将已经存在表中的数据滤除 34 | 35 | Args: 36 | videos (list): video 下载链接 37 | Returns: 38 | list 返回不存在的视频链接 39 | """ 40 | if not videos: 41 | return [] 42 | 43 | video_url_md5s = [x.video_url_md5 for x in videos] 44 | session = DBSession() 45 | query_result = session.query(cls.video_url_md5).\ 46 | filter(cls.video_url_md5.in_(video_url_md5s)).all() 47 | session.commit() 48 | exist_urls = {x[0] for x in query_result} 49 | return [x for x in videos if x.video_url_md5 not in exist_urls] 50 | 51 | @classmethod 52 | def batch_add(cls, videos): 53 | """批量添加记录 54 | 55 | Args: 56 | videos (list): video 下载链接 57 | """ 58 | if not videos: 59 | return 60 | 61 | records = [cls( 62 | source=x.source, 63 | task_id=x.task_id, 64 | img_url=x.img_url, 65 | duration=x.duration, 66 | title=x.title, 67 | video_url=x.video_url, 68 | video_url_md5=x.video_url_md5, 69 | ) for x in videos] 70 | session = DBSession() 71 | session.add_all(records) 72 | session.flush() 73 | session.commit() 74 | return records 75 | 76 | 77 | class DownloadInfo(BaseModel): 78 | 79 | __tablename__ = 'download_info' 80 | 81 | id = Column(Integer, primary_key=True) 82 | video_id = Column(Integer, nullable=False) 83 | video_url = Column(String(200), nullable=False) 84 | video_title = Column(String(200), nullable=False) 85 | video_size = Column(Integer, nullable=False) 86 | status = Column(SmallInteger, nullable=False, default=0) 87 | created_at = Column(DateTime, nullable=False, default=datetime.datetime.now) 88 | updated_at = Column(DateTime, nullable=False, default=datetime.datetime.now, 89 | onupdate=datetime.datetime.now) 90 | 91 | @classmethod 92 | def add(cls, video_info): 93 | """添加记录 94 | 95 | Args: 96 | video_info (VideoInfo): 格式信息 97 | """ 98 | record = cls( 99 | video_id=video_info.video_id, 100 | video_url=video_info.video_url, 101 | video_title=video_info.title, 102 | video_size=video_info.size, 103 | ) 104 | session = DBSession() 105 | session.add(record) 106 | session.flush() 107 | session.commit() 108 | return record 109 | 110 | @classmethod 111 | def update_status(cls, video_id, status=1): 112 | """添加记录 113 | 114 | Args: 115 | video_id (int): 视频id 116 | status (int): 1下载完成 0未下载 117 | """ 118 | session = DBSession() 119 | target = session.query(cls).filter(cls.video_id == video_id) 120 | target.update({'status': status}) 121 | session.commit() 122 | 
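A quick illustration of how the `web_video` and `download_info` models above are meant to be read together. The helper is not part of the repository; it only shows the intended use of the `status` flag defined in tables.sql.

```python
# -*- coding: utf-8 -*-
"""Illustrative snippet only: querying the models defined in spider/models/videos.py."""
from spider.models import DBSession
from spider.models.videos import DownloadInfo


def pending_downloads():
    """Return download_info rows still waiting for you-get (status=0, per tables.sql)."""
    session = DBSession()
    try:
        return (session.query(DownloadInfo)
                .filter(DownloadInfo.status == 0)
                .order_by(DownloadInfo.created_at.desc())
                .all())
    finally:
        session.close()
```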
--------------------------------------------------------------------------------
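The README notes that the post-download step is intentionally left to the user and `spider/publish.py` is an empty stub. Below is a minimal sketch of what wiring that step could look like, following the same ParkerTask pattern as parse and download; the task body and the chaining call are assumptions, not existing code, and the module would also need to be listed in CELERY_IMPORTS to be registered.

```python
# -*- coding: utf-8 -*-
"""Hypothetical sketch for spider/publish.py: the post-download hook left to the user."""
from spider import app
from spider.tools.task import ParkerTask
from spider.config.conf import logger


@app.task(base=ParkerTask)
def publish(video_info):
    """Example follow-up: download_video() could call publish.delay(video_info)
    after a successful download. Replace the body with your own publishing logic."""
    logger.info('publish video {} ({} bytes)'.format(
        video_info.video_id, video_info.size))
```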