├── spider
    ├── tools
    │   ├── __init__.py
    │   ├── db.py
    │   ├── statsd.py
    │   ├── task.py
    │   └── common.py
    ├── publish.py
    ├── config
    │   ├── __init__.py
    │   ├── sites.yaml
    │   ├── params.yaml
    │   ├── logging.yaml
    │   └── conf.py
    ├── pull
    │   ├── __init__.py
    │   └── you_get.py
    ├── __init__.py
    ├── extract
    │   ├── __init__.py
    │   ├── bilibili.py
    │   └── miaopai.py
    ├── models
    │   ├── __init__.py
    │   ├── tables.sql
    │   └── videos.py
    ├── celeryconfig.py
    ├── download.py
    └── parse.py
├── doc
    ├── parker.png
    └── jiankong.png
├── requirements.txt
├── .gitignore
└── README.md

/spider/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/spider/publish.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Publish downloaded videos"""
3 | 
--------------------------------------------------------------------------------
/spider/config/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Crawler configuration"""
3 | 
--------------------------------------------------------------------------------
/spider/pull/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Pull videos"""
3 | 
--------------------------------------------------------------------------------
/doc/parker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LiuRoy/parker/HEAD/doc/parker.png
--------------------------------------------------------------------------------
/doc/jiankong.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LiuRoy/parker/HEAD/doc/jiankong.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | celery==4.0.2
2 | you-get==0.4.652
3 | PyYAML==3.12
4 | requests==2.13.0
5 | pyquery==1.2.17
6 | SQLAlchemy==1.1.6
7 | PyMySQL==0.7.10
8 | statsd==3.2.1
--------------------------------------------------------------------------------
/spider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Crawler"""
3 | 
4 | from celery import Celery
5 | 
6 | app = Celery("spider")
7 | app.config_from_object("spider.celeryconfig")
--------------------------------------------------------------------------------
/spider/extract/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Parse pages to extract video URLs"""
3 | 
4 | from .
import ( 5 | bilibili, 6 | miaopai, 7 | ) 8 | 9 | __all__ = ['bilibili', 'miaopai'] 10 | -------------------------------------------------------------------------------- /spider/tools/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import sessionmaker 2 | from sqlalchemy.engine import create_engine 3 | 4 | 5 | def make_session(db_url): 6 | """根据数据库配置生成会话对象""" 7 | engine = create_engine(db_url) 8 | return sessionmaker(bind=engine) 9 | -------------------------------------------------------------------------------- /spider/tools/statsd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """假监控客户端""" 3 | 4 | 5 | class FakeStatsdClient(object): 6 | """假客户端""" 7 | def __init__(self, *args, **kwargs): 8 | pass 9 | 10 | def incr(self, *args, **kwargs): 11 | pass 12 | -------------------------------------------------------------------------------- /spider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """存储数据库""" 3 | from spider.tools.db import make_session 4 | from spider.config.conf import params 5 | from sqlalchemy.ext.declarative import declarative_base 6 | 7 | 8 | BaseModel = declarative_base() 9 | DBSession = make_session(params['mysql_url']) 10 | -------------------------------------------------------------------------------- /spider/config/sites.yaml: -------------------------------------------------------------------------------- 1 | sites: 2 | - name: bilibili-1 3 | url: http://space.bilibili.com/21474566/#!/video # 超级镜子 4 | task: spider.parse.bilibili 5 | minute: 1 6 | - name: miaopai-2 7 | url: http://m.miaopai.com/v2_index/u/paike_5no3e2iw6g # 陈翔六点半 8 | task: spider.parse.miaopai 9 | minute: 1 10 | -------------------------------------------------------------------------------- /spider/config/params.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | - mode: release 3 | broker_url: amqp://guest:guest@192.168.99.100:5672/parker 4 | mysql_url: mysql+pymysql://root:root@192.168.99.100:3306/parker?charset=utf8mb4 5 | download_path: /home/liuruoyu/Desktop/github/parker 6 | statsd_address: 192.168.99.100:8125 7 | video_number_per_page: 1 8 | download_timeout: 3600 9 | -------------------------------------------------------------------------------- /spider/celeryconfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """celery配置""" 3 | 4 | from spider.config.conf import ( 5 | load_sites, 6 | params 7 | ) 8 | 9 | BROKER_URL = params['broker_url'] 10 | 11 | CELERY_TIMEZONE = 'Asia/Shanghai' 12 | 13 | CELERY_IMPORTS = ( 14 | 'spider.parse', 15 | 'spider.download', 16 | ) 17 | 18 | CELERYBEAT_SCHEDULE = load_sites() 19 | 20 | CELERY_TASK_SERIALIZER = "pickle" 21 | CELERY_ACCEPT_CONTENT = ['pickle', 'json'] 22 | -------------------------------------------------------------------------------- /spider/tools/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """celery base task""" 3 | import celery 4 | from spider.config.conf import logger 5 | 6 | 7 | class ParkerTask(celery.Task): 8 | def on_failure(self, exc, task_id, args, kwargs, einfo): 9 | logger.error("task {} error".format(task_id)) 10 | logger.exception(exc) 11 | return super(ParkerTask, self).on_failure( 12 | exc, task_id, args, kwargs, 
einfo) 13 | 14 | def on_success(self, retval, task_id, args, kwargs): 15 | logger.info("task {} done".format(task_id)) 16 | return super(ParkerTask, self).on_success( 17 | retval, task_id, args, kwargs) 18 | -------------------------------------------------------------------------------- /spider/download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """下载视频任务""" 3 | from spider import app 4 | from spider.tools.task import ParkerTask 5 | from spider.pull.you_get import ( 6 | get_video_info, 7 | download_video 8 | ) 9 | 10 | 11 | @app.task(base=ParkerTask) 12 | def bilibili(video): 13 | """根据bilibili播放地址下载视频 14 | 15 | Args: 16 | video (Videos): 视频记录 17 | """ 18 | video_info = get_video_info(video.video_url, video.task_id) 19 | download_video(video_info) 20 | 21 | 22 | @app.task(base=ParkerTask) 23 | def miaopai(video): 24 | """根据miaopai播放地址下载视频 25 | 26 | Args: 27 | video (Videos): 视频记录 28 | """ 29 | video_info = get_video_info(video.video_url, video.task_id) 30 | download_video(video_info) 31 | -------------------------------------------------------------------------------- /spider/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析页面任务""" 3 | from spider import app 4 | from spider import download 5 | from spider.tools.task import ParkerTask 6 | from spider import extract 7 | 8 | 9 | @app.task(base=ParkerTask) 10 | def bilibili(url, name): 11 | """抓取哔哩哔哩 解析获取最新视频地址 12 | 13 | Args: 14 | url (string): 哔哩哔哩页面地址 15 | name (string): 定时任务名称 16 | """ 17 | new_videos = extract.bilibili.extract_videos(url, name) 18 | if new_videos: 19 | for video in new_videos: 20 | download.bilibili.delay(video) 21 | 22 | 23 | @app.task(base=ParkerTask) 24 | def miaopai(url, name): 25 | """抓取秒拍页面 解析获取最新视频地址 26 | 27 | Args: 28 | url (string): 美拍页面地址 29 | name (string): 定时任务名称 30 | """ 31 | new_videos = extract.miaopai.extract_videos(url, name) 32 | if new_videos: 33 | for video in new_videos: 34 | download.miaopai.delay(video) 35 | -------------------------------------------------------------------------------- /spider/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: no 3 | 4 | formatters: 5 | simple: 6 | format: '%(asctime)s [%(levelname)s] [%(process)d] %(filename)s-%(lineno)d: %(message)s' 7 | detail: 8 | format: '%(asctime)s [%(levelname)s] [%(process)d] %(pathname)s-%(lineno)d: %(message)s' 9 | 10 | handlers: 11 | console: 12 | class: logging.StreamHandler 13 | level: DEBUG 14 | formatter: detail 15 | stream: ext://sys.stdout 16 | files: 17 | class: logging.handlers.WatchedFileHandler 18 | level: DEBUG 19 | formatter: simple 20 | filename: /home/liuruoyu/Desktop/github/parker/worker.log 21 | 22 | loggers: 23 | parker.debug: 24 | level: DEBUG 25 | handlers: [console] 26 | propagate: no 27 | qualname: parker.debug 28 | parker.release: 29 | level: INFO 30 | handlers: [files] 31 | propagate: no 32 | qualname: parker.release 33 | 34 | root: 35 | level: DEBUG 36 | handlers: [console] 37 | -------------------------------------------------------------------------------- /spider/models/tables.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA `parker` DEFAULT CHARACTER SET utf8mb4 ; 2 | 3 | CREATE TABLE `download_info` ( 4 | `id` int(11) NOT NULL AUTO_INCREMENT, 5 | `video_id` int(11) NOT NULL COMMENT '视频id', 6 | `video_url` 
varchar(200) NOT NULL COMMENT '播放url', 7 | `video_title` varchar(200) NOT NULL COMMENT '视频标题', 8 | `video_size` int(11) NOT NULL DEFAULT '0' COMMENT '视频大小', 9 | `status` tinyint(4) NOT NULL DEFAULT '0' COMMENT '是否下载完成 1下载完成 0未下载', 10 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 11 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 12 | PRIMARY KEY (`id`), 13 | KEY `ix_video_id` (`video_id`) 14 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='视频下载信息表'; 15 | 16 | CREATE TABLE `web_video` ( 17 | `id` int(11) NOT NULL AUTO_INCREMENT, 18 | `source` varchar(10) NOT NULL COMMENT '网站类型', 19 | `task_id` int(11) NOT NULL COMMENT '任务id', 20 | `img_url` varchar(200) NOT NULL COMMENT '视频封面链接', 21 | `duration` int(11) NOT NULL COMMENT '视频时长', 22 | `title` varchar(200) NOT NULL DEFAULT '' COMMENT '视频标题', 23 | `video_url` varchar(200) NOT NULL COMMENT '视频播放页面', 24 | `video_url_md5` varchar(32) NOT NULL COMMENT '视频播放页面', 25 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 26 | PRIMARY KEY (`id`), 27 | KEY `ix_video_url_md5` (`video_url_md5`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='网站视频信息'; 29 | -------------------------------------------------------------------------------- /spider/config/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析配置文件""" 3 | import os 4 | import yaml 5 | import logging 6 | import logging.config 7 | from datetime import timedelta 8 | 9 | import statsd 10 | from spider.tools.statsd import FakeStatsdClient 11 | 12 | current_dir = os.path.dirname(__file__) 13 | sites_path = os.path.join(current_dir, 'sites.yaml') 14 | params_path = os.path.join(current_dir, 'params.yaml') 15 | logging_path = os.path.join(current_dir, 'logging.yaml') 16 | 17 | with open(logging_path, 'r') as f: 18 | logging.config.dictConfig(yaml.load(f)) 19 | 20 | 21 | def load_sites(): 22 | """解析sites.yaml 生成CELERYBEAT_SCHEDULE""" 23 | with open(sites_path, 'r') as f: 24 | sites = yaml.load(f) 25 | return {x['name']: { 26 | 'task': x['task'], 27 | 'schedule': timedelta(minutes=int(x['minute'])), 28 | 'args': (x['url'], x['name']) 29 | } for x in sites['sites']} 30 | 31 | 32 | def load_params(): 33 | """解析params.yaml""" 34 | with open(params_path, 'r') as f: 35 | p = yaml.load(f) 36 | return p['params'][0] 37 | 38 | 39 | params = load_params() 40 | if params['mode'] == 'debug': 41 | logger = logging.getLogger('parker.debug') 42 | statsd_client = FakeStatsdClient() 43 | else: 44 | logger = logging.getLogger('parker.release') 45 | statsd_host, statsd_port = params['statsd_address'].split(':') 46 | statsd_client = statsd.StatsClient( 47 | host=statsd_host, port=int(statsd_port), prefix='parker') 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds 
the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | .venv/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | .idea/ 95 | -------------------------------------------------------------------------------- /spider/tools/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """一些公用结构体""" 3 | import hashlib 4 | from collections import namedtuple 5 | 6 | 7 | WebVideo = namedtuple( 8 | 'WebVideo', [ 9 | 'source', # string 网站类型 10 | 'task_id', # int 定时任务id 11 | 'img_url', # string 图片链接 12 | 'duration', # int 播放时长 13 | 'title', # string 视频标题 14 | 'video_url', # string 视频链接 15 | 'video_url_md5', # string 视频链接md值 16 | ]) 17 | 18 | VideoInfo = namedtuple( 19 | 'VideoInfo', [ 20 | 'video_id', # int Videos记录id 21 | 'video_url', # string 播放url 22 | 'title', # string 视频标题 23 | 'size', # int 视频大小 24 | ]) 25 | 26 | 27 | def parse_task(task_name): 28 | """解析定时任务名称获取任务类型和id 29 | 30 | Args: 31 | task_name (string): 任务名称 32 | Returns: 33 | source (string): 网站类型 34 | task_id int: 任务id 35 | """ 36 | source, task_id = task_name.split('-') 37 | return source, int(task_id) 38 | 39 | 40 | def parse_video_time(v_time): 41 | """解析视频时长 42 | 43 | Args: 44 | v_time (string): 视频时长 格式: 4:50:89 01:29 45 | Returns: 46 | int 视频时长秒数 47 | """ 48 | items = v_time.split(':') 49 | if len(items) == 2: 50 | return int(items[0]) * 60 + int(items[1]) 51 | 52 | if len(items) == 3: 53 | return int(items[0]) * 3600 + int(items[1]) * 60 + int(items[2]) 54 | 55 | return 0 56 | 57 | 58 | def get_md5(content): 59 | """计算md5 60 | 61 | Args: 62 | content (string): 要计算md5的字符串 63 | Returns: 64 | string: 计算好的md5值 65 | """ 66 | md5 = hashlib.md5() 67 | md5.update(content.encode('utf-8')) 68 | return md5.hexdigest() 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parker 2 | 3 | parker短视频爬虫项目(项目地址:[https://github.com/LiuRoy/parker](https://github.com/LiuRoy/parker)),它采用celery框架定时爬取用户视频列表,将最新发布的视频通过you-get异步下载,可以很方便地实现分布式部署。因为各个网站的页面布局和接口更新比较频繁,为了保证程序的高可用,特意增加了Statsd监控,方便及时发现出错。 4 | 5 | ## 代码架构 6 | 7 | ![代码架构](./doc/parker.png) 8 | 9 | 目前parker中只实现了B站和秒拍的下载,从框架图可以看出,针对每一类网站,需要实现两个异步接口:从用户视频主页解析发布视频的播放地址、根据播放地址下载视频。因此增加网站类型,不需要修改原来的代码,只需要添加新的解析和下载接口即可。针对视频下载完成之后的后续操作,我还没有实现,大家可以根据自己的需求自由的去实现。 10 | 11 | 在运行的时候,celery会将配置好的优质用户列表定时发送到对应网站的解析接口异步执行,筛选出最新发布的视频播放地址,交给对应的下载接口异步下载,下载完成之后再异步调用后续操作。因此需要启动一个celery beat进程发送定时任务,以及若干celery异步任务去执行解析和下载操作,对于比较大的视频,下载会相当耗时,建议根据任务列表的多少合理分配异步任务的个数。 12 | 13 | ## 程序运行 14 | 15 | 经验证,此程序可以在ubuntu和mac下正常运行, 由于本地windows下的celery无法正常启动,所以没有在windows环境做过验证。 16 | 17 
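The architecture section above describes how celery beat periodically pushes each configured user page to the matching parse task. With the default `sites.yaml`, the `load_sites()` helper in `spider/config/conf.py` expands that configuration into a beat schedule equivalent to the snippet below; this is only an illustration of the mechanism, not a file in the repository.

```python
from datetime import timedelta

# Equivalent of what load_sites() returns for the default sites.yaml:
# one beat entry per hot user, re-checked every `minute` minutes.
CELERYBEAT_SCHEDULE = {
    'bilibili-1': {
        'task': 'spider.parse.bilibili',
        'schedule': timedelta(minutes=1),
        'args': ('http://space.bilibili.com/21474566/#!/video', 'bilibili-1'),
    },
    'miaopai-2': {
        'task': 'spider.parse.miaopai',
        'schedule': timedelta(minutes=1),
        'args': ('http://m.miaopai.com/v2_index/u/paike_5no3e2iw6g', 'miaopai-2'),
    },
}
```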
| ### 依赖库安装 18 | 19 | python版本为3.5,进入项目目录后,执行: 20 | 21 | ```bash 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | ### 创建数据库表 26 | 27 | 提前在数据库中建好两张表(sql: [https://github.com/LiuRoy/parker/blob/master/spider/models/tables.sql](https://github.com/LiuRoy/parker/blob/master/spider/models/tables.sql)) 28 | 29 | ### 参数配置 30 | 31 | config路径下的logging.yaml、params.yaml、sites.yaml分别对应日志配置、运行参数配置、热门用户配置。 32 | 33 | #### 日志配置 34 | 35 | debug模式下日志会直接输出在标准输出流,release模式下会将日志内容输出到文件中,因此需要配置输出日志文件。 36 | 37 | #### 运行配置 38 | 39 | + mode debug调试模式,此模式下日志指向标准输出,并且没有监控数据;release模式下,日志输出到制定文件,并且有监控数据。 40 | + broker_url 对应于celery的BROKER_URL,可以配置为redis或者rabbitmq 41 | + mysql_url 数据库地址,需要提前建好两张表 42 | + download_path 视频下载路径 43 | + statsd_address 监控地址 44 | + video_number_per_page 每次从用户视频主页解析出多少条视频播放地址,因为大部分用户每次发布的视频个数很少,只需要设置成一个很小的值即可。在初次运行的时候,也不会下载大量久远的视频。 45 | + download_timeout 视频下载的超时时间 46 | 47 | #### 热门用户配置 48 | 49 | parker会根据此配置生成一份celery beat scheduler列表。 50 | 51 | + name 规则是`<网站类型>-<任务id>`,parker会根据此作为scheduler任务名称 52 | + url 用户的发布视频主页 53 | + task 对应的celery解析异步任务 54 | + minute 多少分钟检查一次用户视频列表 55 | 56 | ### 启动任务 57 | 58 | 进入项目目录,执行下面命令启动celery worker 59 | 60 | ```bash 61 | celery -A spider worker 62 | ``` 63 | 64 | 执行下面命令启动celery beat定时任务 65 | 66 | ```bash 67 | celery -A spider beat 68 | ``` 69 | 70 | ## 监控 71 | 72 | 强烈安利一个docker镜像 [https://hub.docker.com/r/samuelebistoletti/docker-statsd-influxdb-grafana/](https://hub.docker.com/r/samuelebistoletti/docker-statsd-influxdb-grafana/),一分钟配好监控环境有木有。之后只需要添加执行成功和执行异常的打点数据,就可以方便的监控程序是否正常运行了。 73 | 74 | ![监控](./doc/jiankong.png) -------------------------------------------------------------------------------- /spider/extract/bilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """解析bilibili页面""" 3 | import re 4 | import requests 5 | from spider.tools.common import ( 6 | WebVideo, 7 | get_md5, 8 | parse_task, 9 | parse_video_time, 10 | ) 11 | from spider.models.videos import Videos 12 | from spider.config.conf import ( 13 | params, 14 | logger, 15 | statsd_client, 16 | ) 17 | 18 | 19 | def get_user_id(url): 20 | """从个人视频页链接解析出用户id 21 | 22 | Args: 23 | url (string): 人视频页链接 24 | Returns: 25 | int: 用户id 26 | """ 27 | result = re.search(r'\d+', url) 28 | return int(result.group()) 29 | 30 | 31 | def get_video_lists(video_data, source, task_id): 32 | """从获取的视频数据中找到感兴趣的数据 33 | 34 | Args: 35 | video_data (dict): 视频数据 36 | source (string): 网站类型 37 | task_id (int): 任务id 38 | Returns: 39 | list 视频信息列表 40 | """ 41 | result = [] 42 | for item in video_data['data']['vlist']: 43 | v_time = item['length'] 44 | video_url = 'http://www.bilibili.com/video/av{}/'.format(item['aid']) 45 | 46 | result.append(WebVideo( 47 | source=source, 48 | task_id=task_id, 49 | img_url=item['pic'], 50 | duration=parse_video_time(v_time), 51 | title=item['title'], 52 | video_url=video_url, 53 | video_url_md5=get_md5(video_url) 54 | )) 55 | return result 56 | 57 | 58 | def extract_videos(url, name): 59 | """解析视频列表 60 | 61 | Args: 62 | url (string): 哔哩哔哩页面地址 63 | name (string): 定时任务名称 64 | """ 65 | user_id = get_user_id(url) 66 | source, task_id = parse_task(name) 67 | 68 | request_url = 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&' \ 69 | 'pagesize={}&tid=0&page=1&keyword=&order=senddate'.format( 70 | user_id, params['video_number_per_page']) 71 | try: 72 | response = requests.get(request_url, timeout=10) 73 | if not response.ok: 74 | statsd_client.incr('bilibili.extract.exc') 75 | logger.error('request failure. 
url:{} name:{}'.format(request_url, name)) 76 | return 77 | 78 | video_data = response.json() 79 | videos = get_video_lists(video_data, source, task_id) 80 | except Exception as exc: 81 | statsd_client.incr('bilibili.extract.exc') 82 | logger.error('request failure. url:{} name:{}'.format(request_url, name)) 83 | logger.exception(exc) 84 | else: 85 | statsd_client.incr('bilibili.extract.suc') 86 | logger.info('request success. url:{} name:{}'.format(request_url, name)) 87 | new_videos = Videos.filter_exist(videos) 88 | Videos.batch_add(new_videos) 89 | return new_videos 90 | -------------------------------------------------------------------------------- /spider/pull/you_get.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """用you-get库下载视频""" 3 | import re 4 | import subprocess 5 | 6 | from spider.config.conf import params 7 | from spider.tools.common import VideoInfo 8 | from spider.models.videos import DownloadInfo 9 | from spider.config.conf import ( 10 | logger, 11 | statsd_client, 12 | ) 13 | 14 | 15 | def parse_size(size_info): 16 | """解析视频大小信息 17 | 18 | Args: 19 | size_info (string): eg: 62.9 MiB (65951953 bytes) 20 | Returns: 21 | int 22 | """ 23 | result = re.search(r'(\d+) bytes', size_info) 24 | if result: 25 | return int(result.group(1)) 26 | return 0 27 | 28 | 29 | def get_video_info(play_url, video_id): 30 | """获取视频信息 31 | 32 | Args: 33 | play_url (string): 播放地址 34 | video_id (int): 视频id 35 | """ 36 | try: 37 | info_comd = "you-get -i {}".format(play_url) 38 | p = subprocess.Popen(info_comd, shell=True, stdout=subprocess.PIPE) 39 | p.wait() 40 | content = p.stdout.read() 41 | content = content.decode('utf-8') 42 | 43 | video_title = re.search(r'Title:\s+(.*?)\s+Type:', content).group(1) 44 | video_size = int(re.search(r'\((\d+) Bytes\)', content).group(1)) 45 | 46 | video_info = VideoInfo( 47 | video_id=video_id, 48 | video_url=play_url, 49 | title=video_title, 50 | size=video_size, 51 | ) 52 | logger.info(video_info) 53 | except Exception as exc: 54 | statsd_client.incr('youget.info.exc') 55 | logger.error('you-get info failure: url:{} video:{}'.format( 56 | play_url, video_id)) 57 | logger.error(exc) 58 | else: 59 | statsd_client.incr('youget.info.suc') 60 | logger.info('you-get info success: url:{} video:{}'.format( 61 | play_url, video_id)) 62 | DownloadInfo.add(video_info) 63 | return video_info 64 | 65 | 66 | def download_video(video_info): 67 | """下载视频 68 | 69 | Args: 70 | video_info (VideoInfo): 视频下载信息 71 | """ 72 | try: 73 | download_comd = "you-get {}".format(video_info.video_url) 74 | p = subprocess.Popen(download_comd, 75 | shell=True, cwd=params['download_path']) 76 | p.wait(int(params['download_timeout'])) 77 | if p.returncode != 0: 78 | raise Exception("download failed") 79 | except Exception as exc: 80 | statsd_client.incr('youget.download.exc') 81 | logger.error('you-get download failure: url:{} video:{}'.format( 82 | video_info.video_url, video_info.video_id)) 83 | logger.exception(exc) 84 | else: 85 | statsd_client.incr('youget.download.suc') 86 | logger.info('you-get download success: url:{} video:{}'.format( 87 | video_info.video_url, video_info.video_id)) 88 | DownloadInfo.update_status(video_info.video_id) 89 | -------------------------------------------------------------------------------- /spider/extract/miaopai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """获取没拍视频列表""" 3 | import re 4 | import os 5 | import 
requests 6 | from pyquery import PyQuery 7 | 8 | from spider.tools.common import ( 9 | WebVideo, 10 | get_md5, 11 | parse_task, 12 | ) 13 | from spider.models.videos import Videos 14 | from spider.config.conf import ( 15 | params, 16 | logger, 17 | statsd_client, 18 | ) 19 | 20 | 21 | def get_user_id(html): 22 | """从html中找到user_id 23 | 24 | Args: 25 | html (string): 页面 26 | Returns: 27 | string: 用户id 28 | """ 29 | result = re.search(r"var suid = '(.+?)';", html) 30 | return result.group(1) 31 | 32 | 33 | def get_video_lists(html, source, task_id): 34 | """从获取的视频数据中找到感兴趣的数据 35 | 36 | Args: 37 | html (string): 视频html页面数据 38 | source (string): 网站类型 39 | task_id (int): 任务id 40 | Returns: 41 | list 视频信息列表 42 | """ 43 | page = PyQuery(html) 44 | result = [] 45 | for item in page("div[class='card_wrapping']"): 46 | title = item.xpath("./div[@class='h_title']")[0].text 47 | img_url = item.xpath("./a/div")[0].get("data-url") 48 | play_data = os.path.basename(img_url) 49 | video_url = 'http://www.miaopai.com/show/{}.htm'.format( 50 | play_data.split('_')[0]) 51 | 52 | result.append(WebVideo( 53 | source=source, 54 | task_id=task_id, 55 | img_url=img_url, 56 | duration=0, 57 | title=title, 58 | video_url=video_url, 59 | video_url_md5=get_md5(video_url) 60 | )) 61 | return result 62 | 63 | 64 | def extract_videos(url, name): 65 | """解析视频列表 66 | 67 | Args: 68 | url (string): 哔哩哔哩页面地址 69 | name (string): 定时任务名称 70 | """ 71 | source, task_id = parse_task(name) 72 | try: 73 | response = requests.get(url, timeout=10) 74 | if not response.ok: 75 | statsd_client.incr('miaopai.extract.exc') 76 | logger.error('request failure. url:{} name:{}'.format(url, name)) 77 | return 78 | 79 | user_id = get_user_id(response.text) 80 | video_page_url = 'http://m.miaopai.com/show/getOwnerVideo?suid={}&page=1&per={}'.format( 81 | user_id, params['video_number_per_page']) 82 | response = requests.get(video_page_url, timeout=10) 83 | if not response.ok: 84 | statsd_client.incr('miaopai.extract.exc') 85 | logger.error('request failure. url:{} name:{}'.format(video_page_url, name)) 86 | return 87 | 88 | video_div = response.json()['msg'] 89 | videos = get_video_lists(video_div, source, task_id) 90 | except Exception as exc: 91 | statsd_client.incr('miaopai.extract.exc') 92 | logger.error('request failure. name:{}'.format(name)) 93 | logger.exception(exc) 94 | else: 95 | statsd_client.incr('miaopai.extract.suc') 96 | logger.info('request success. 
name:{}'.format(name)) 97 | new_videos = Videos.filter_exist(videos) 98 | Videos.batch_add(new_videos) 99 | return new_videos 100 | -------------------------------------------------------------------------------- /spider/models/videos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """视频存储""" 3 | import datetime 4 | from sqlalchemy import ( 5 | Column, 6 | String, 7 | Integer, 8 | SmallInteger, 9 | DateTime, 10 | ) 11 | from spider.models import ( 12 | DBSession, 13 | BaseModel, 14 | ) 15 | 16 | 17 | class Videos(BaseModel): 18 | 19 | __tablename__ = 'web_video' 20 | 21 | id = Column(Integer, primary_key=True) 22 | source = Column(String(10), nullable=False) 23 | task_id = Column(Integer, nullable=False) 24 | img_url = Column(String(200), nullable=False) 25 | duration = Column(Integer, nullable=False) 26 | title = Column(String(200), nullable=False) 27 | video_url = Column(String(200), nullable=False) 28 | video_url_md5 = Column(String(32), nullable=False) 29 | created_at = Column(DateTime, nullable=False, default=datetime.datetime.now) 30 | 31 | @classmethod 32 | def filter_exist(cls, videos): 33 | """将已经存在表中的数据滤除 34 | 35 | Args: 36 | videos (list): video 下载链接 37 | Returns: 38 | list 返回不存在的视频链接 39 | """ 40 | if not videos: 41 | return [] 42 | 43 | video_url_md5s = [x.video_url_md5 for x in videos] 44 | session = DBSession() 45 | query_result = session.query(cls.video_url_md5).\ 46 | filter(cls.video_url_md5.in_(video_url_md5s)).all() 47 | session.commit() 48 | exist_urls = {x[0] for x in query_result} 49 | return [x for x in videos if x.video_url_md5 not in exist_urls] 50 | 51 | @classmethod 52 | def batch_add(cls, videos): 53 | """批量添加记录 54 | 55 | Args: 56 | videos (list): video 下载链接 57 | """ 58 | if not videos: 59 | return 60 | 61 | records = [cls( 62 | source=x.source, 63 | task_id=x.task_id, 64 | img_url=x.img_url, 65 | duration=x.duration, 66 | title=x.title, 67 | video_url=x.video_url, 68 | video_url_md5=x.video_url_md5, 69 | ) for x in videos] 70 | session = DBSession() 71 | session.add_all(records) 72 | session.flush() 73 | session.commit() 74 | return records 75 | 76 | 77 | class DownloadInfo(BaseModel): 78 | 79 | __tablename__ = 'download_info' 80 | 81 | id = Column(Integer, primary_key=True) 82 | video_id = Column(Integer, nullable=False) 83 | video_url = Column(String(200), nullable=False) 84 | video_title = Column(String(200), nullable=False) 85 | video_size = Column(Integer, nullable=False) 86 | status = Column(SmallInteger, nullable=False, default=0) 87 | created_at = Column(DateTime, nullable=False, default=datetime.datetime.now) 88 | updated_at = Column(DateTime, nullable=False, default=datetime.datetime.now, 89 | onupdate=datetime.datetime.now) 90 | 91 | @classmethod 92 | def add(cls, video_info): 93 | """添加记录 94 | 95 | Args: 96 | video_info (VideoInfo): 格式信息 97 | """ 98 | record = cls( 99 | video_id=video_info.video_id, 100 | video_url=video_info.video_url, 101 | video_title=video_info.title, 102 | video_size=video_info.size, 103 | ) 104 | session = DBSession() 105 | session.add(record) 106 | session.flush() 107 | session.commit() 108 | return record 109 | 110 | @classmethod 111 | def update_status(cls, video_id, status=1): 112 | """添加记录 113 | 114 | Args: 115 | video_id (int): 视频id 116 | status (int): 1下载完成 0未下载 117 | """ 118 | session = DBSession() 119 | target = session.query(cls).filter(cls.video_id == video_id) 120 | target.update({'status': status}) 121 | session.commit() 122 | 
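A quick illustration of how the `web_video` and `download_info` models above are meant to be read together. The helper is not part of the repository; it only shows the intended use of the `status` flag defined in tables.sql.

```python
# -*- coding: utf-8 -*-
"""Illustrative snippet only: querying the models defined in spider/models/videos.py."""
from spider.models import DBSession
from spider.models.videos import DownloadInfo


def pending_downloads():
    """Return download_info rows still waiting for you-get (status=0, per tables.sql)."""
    session = DBSession()
    try:
        return (session.query(DownloadInfo)
                .filter(DownloadInfo.status == 0)
                .order_by(DownloadInfo.created_at.desc())
                .all())
    finally:
        session.close()
```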
--------------------------------------------------------------------------------
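The README notes that the post-download step is intentionally left to the user and `spider/publish.py` is an empty stub. Below is a minimal sketch of what wiring that step could look like, following the same ParkerTask pattern as parse and download; the task body and the chaining call are assumptions, not existing code, and the module would also need to be listed in CELERY_IMPORTS to be registered.

```python
# -*- coding: utf-8 -*-
"""Hypothetical sketch for spider/publish.py: the post-download hook left to the user."""
from spider import app
from spider.tools.task import ParkerTask
from spider.config.conf import logger


@app.task(base=ParkerTask)
def publish(video_info):
    """Example follow-up: download_video() could call publish.delay(video_info)
    after a successful download. Replace the body with your own publishing logic."""
    logger.info('publish video {} ({} bytes)'.format(
        video_info.video_id, video_info.size))
```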