├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── crawloop.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── engine.docker-compose.yml
├── image_build
│   ├── msyh.ttf
│   ├── playwright-xvfb
│   └── sources.list
├── services
│   ├── engine
│   │   ├── .dockerignore
│   │   ├── Dockerfile-dev
│   │   ├── aps
│   │   │   ├── __init__.py
│   │   │   ├── func.py
│   │   │   ├── logger.py
│   │   │   └── server.py
│   │   ├── apserver.py
│   │   ├── build.sh
│   │   ├── build_sentry_ini.py
│   │   ├── compile.py
│   │   ├── entrypoint.sh
│   │   ├── grpcserver.py
│   │   ├── gunicorn_config.py
│   │   ├── gunicorn_logging.ini
│   │   ├── manage.py
│   │   ├── migrations
│   │   │   ├── README
│   │   │   ├── alembic.ini
│   │   │   ├── env.py
│   │   │   ├── script.py.mako
│   │   │   └── versions
│   │   │       ├── 1569921cac58_加入响应时间和渲染时间.py
│   │   │       ├── 4a243739ef84_初始化.py
│   │   │       ├── 5b189e0161ee_加入网站编码.py
│   │   │       ├── 71bf761944f8_新增网站图标字段.py
│   │   │       └── b3bd5bc9e4e3_增加extra客户端额外数据.py
│   │   ├── requirements.txt
│   │   ├── rpc
│   │   │   ├── __init__.py
│   │   │   ├── client
│   │   │   │   ├── __init__.py
│   │   │   │   └── callback_client.py
│   │   │   ├── codegen.sh
│   │   │   ├── pb
│   │   │   │   ├── __init__.py
│   │   │   │   ├── callback_pb2.py
│   │   │   │   ├── callback_pb2_grpc.py
│   │   │   │   ├── result_pb2.py
│   │   │   │   └── result_pb2_grpc.py
│   │   │   ├── protos
│   │   │   │   └── result.proto
│   │   │   └── server
│   │   │       ├── __init__.py
│   │   │       └── result.py
│   │   ├── sources.list
│   │   ├── wappalyzer
│   │   │   ├── __init__.py
│   │   │   ├── data.json
│   │   │   ├── helper.py
│   │   │   ├── modelcalss.py
│   │   │   └── wappalyzerhandler.py
│   │   ├── webs
│   │   │   ├── __init__.py
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bizs
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── result.py
│   │   │   │   │   └── task.py
│   │   │   │   ├── exceptions
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── customs.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── apscheduler_job.py
│   │   │   │   │   ├── base_model.py
│   │   │   │   │   ├── db_proxy
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── apschedule.py
│   │   │   │   │   │   ├── base.py
│   │   │   │   │   │   ├── result.py
│   │   │   │   │   │   ├── schedule_task.py
│   │   │   │   │   │   ├── server.py
│   │   │   │   │   │   ├── subtask.py
│   │   │   │   │   │   ├── task.py
│   │   │   │   │   │   ├── task_url.py
│   │   │   │   │   │   └── url.py
│   │   │   │   │   ├── result.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   ├── task.py
│   │   │   │   │   ├── task_url.py
│   │   │   │   │   └── url.py
│   │   │   │   ├── schemas
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── results.py
│   │   │   │   │   └── tasks.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── helper.py
│   │   │   │   │   ├── loggers.py
│   │   │   │   │   ├── requests.py
│   │   │   │   │   ├── responses.py
│   │   │   │   │   ├── routers.py
│   │   │   │   │   └── settings.py
│   │   │   │   └── views
│   │   │   │       ├── __init__.py
│   │   │   │       ├── ping.py
│   │   │   │       ├── results.py
│   │   │   │       └── tasks.py
│   │   │   ├── config.py
│   │   │   └── core
│   │   │       ├── __init__.py
│   │   │       └── requests
│   │   │           ├── __init__.py
│   │   │           └── request.py
│   │   └── worker
│   │       ├── __init__.py
│   │       ├── engine.py
│   │       ├── library
│   │       │   ├── __init__.py
│   │       │   ├── favicon.py
│   │       │   └── helper.py
│   │       └── result.py
│   └── spider
│       ├── .dockerignore
│       ├── Dockerfile-dev
│       ├── Dockerfile-prod
│       ├── build.sh
│       ├── build_sentry_ini.py
│       ├── compile.py
│       ├── entrypoint.sh
│       ├── gunicorn_config.py
│       ├── gunicorn_logging.ini
│       ├── manage.py
│       ├── migrations
│       │   ├── README
│       │   ├── alembic.ini
│       │   ├── env.py
│       │   ├── script.py.mako
│       │   └── versions
│       │       ├── 81a88acb3641_记录cookies.py
│       │       └── 8efa2b9dcc87_init.py
│       ├── requirements.txt
│       ├── rpc
│       │   ├── __init__.py
│       │   ├── client
│       │   │   ├── __init__.py
│       │   │   └── result.py
│       │   └── pb
│       │       ├── __init__.py
│       │       ├── result_pb2.py
│       │       └── result_pb2_grpc.py
│       ├── sources.list
│       ├── webs
│       │   ├── __init__.py
│       │   ├── api
│       │   │   ├── __init__.py
│       │   │   ├── bizs
│       │   │   │   ├── __init__.py
│       │   │   │   └── crawl_task.py
│       │   │   ├── exceptions
│       │   │   │   ├── __init__.py
│       │   │   │   └── customs.py
│       │   │   ├── models
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base_model.py
│       │   │   │   ├── crawl_task.py
│       │   │   │   ├── db_proxy
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── crawl_task.py
│       │   │   │   │   └── result.py
│       │   │   │   └── result.py
│       │   │   ├── schemas
│       │   │   │   ├── __init__.py
│       │   │   │   └── crawl_tasks.py
│       │   │   ├── utils
│       │   │   │   ├── __init__.py
│       │   │   │   ├── loggers.py
│       │   │   │   ├── requests.py
│       │   │   │   ├── responses.py
│       │   │   │   ├── routers.py
│       │   │   │   └── settings.py
│       │   │   └── views
│       │   │       ├── __init__.py
│       │   │       ├── crawl_tasks.py
│       │   │       └── ping.py
│       │   └── config.py
│       └── worker
│           ├── __init__.py
│           ├── fetch.py
│           ├── library
│           │   ├── __init__.py
│           │   ├── helper.py
│           │   └── playwright.py
│           └── results.py
├── spider.docker-conpose.yml
└── 架构图.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/.idea/.gitignore
--------------------------------------------------------------------------------
/.idea/crawloop.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | A simple Playwright crawler was reimplemented in Go, and its performance turned out to be far beyond what the Python version can reach. A simple Go version of [website directory/file brute forcing](https://github.com/who0sy/WebsiteFuzz) was built afterwards as well, and it runs remarkably fast. When time allows, this project will also be rewritten in Go; the Python version is no longer maintained.
3 | ------------------------------------------------------------------------------------------------------------
4 | 
5 |
6 |
7 |
8 | # crawloop
9 | Crawls JavaScript-rendered dynamic pages with Playwright, collecting page source, screenshots, site entry points, page interactions, and more, with support for priority-based task scheduling.
10 |
11 | crawloop currently supports the following features:
12 | - Native browser environment with Chrome and Firefox support, plus an optional in-container virtual display;
13 | - Full DOM event collection with automated triggering;
14 | - Comprehensive collection and analysis, including JS files, page source, screenshots, favicon, title, charset, cookies, redirect chains, and more;
15 | - Python port of Wappalyzer for website fingerprinting, reporting the technologies in use, their versions, and confidence scores;
16 | - Host binding, with support for custom Referer headers;
17 | - Request proxies, and active callbacks of crawl results over HTTP or gRPC;
18 | - Task progress monitoring: track task progress in real time;
19 | - Custom task parameters: scheduling parameters can be configured online and take effect immediately;
20 | - Scheduling-center HA (centralized): scheduling uses a centralized design; the self-built "scheduling center" component supports cluster deployment to keep the scheduling center highly available;
21 | - Crawler-executor HA (distributed): tasks are executed in a distributed fashion; "crawler executors" support cluster deployment to keep task execution highly available;
22 | - Elastic scaling: whenever executor machines come online or go offline, tasks are reassigned at the next scheduling round;
23 | - Trigger strategies: a rich set of triggers including Cron, fixed interval, fixed delay, API (event), manual, and parent-child task triggers (see the scheduling sketch after this list);
24 | - Blocking strategies: when scheduling becomes too dense for executors to keep up, choose between serial execution on a single machine (default), discarding subsequent runs, or overriding the previous run;
25 | - Task timeout control: custom task timeouts; a task that runs past its timeout is actively interrupted;
26 | - Failure retry: configurable retry counts; failed tasks are retried automatically, sharded tasks retry at shard granularity, and crawls can resume from where they stopped;
27 | - Routing strategies for executor clusters: first (implemented), last (implemented), round robin (implemented), weighted round robin (implemented), consistent hashing (planned), and more;
28 | - Dynamic sharding: broadcast tasks are sharded by size (100 URLs per shard by default) and dispatched shard by shard to different executors; this significantly improves throughput for large crawls;
29 | - Scheduling thread pool: the scheduler fires jobs from multiple threads so schedules run precisely and are never blocked;
30 | - Fully asynchronous: scheduling, execution, and callbacks are all asynchronous, which smooths traffic peaks under dense scheduling and in theory supports tasks of any duration;
31 | - Cross-language: the scheduling center and crawler executors expose language-agnostic RESTful APIs, so any third-party language can integrate with the center or implement a custom executor;
32 | - Task priority control: executors use priority queues to isolate tasks by priority; slow or low-weight tasks are automatically demoted to a "Slow" queue so executors are never exhausted;
33 | - Containerized: the project is compiled inside containers, so everything works out of the box;
34 |
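Internally, the scheduling center registers these triggers with an APScheduler instance that is exposed over rpyc (`services/engine/aps/server.py`, started by `services/engine/apserver.py`). A minimal sketch of adding an interval-triggered job through that RPC service, assuming it is reachable on `localhost:15003` (the port published by `engine.docker-compose.yml`) and that a task with id 42 already exists in the database:

```python
import rpyc

# Connect to the APScheduler RPC service started by services/engine/apserver.py.
conn = rpyc.connect('localhost', 15003)

# exposed_add_task(task_id, schedule_type, schedule_data) maps schedule_type onto an
# APScheduler trigger ('instantly' runs once immediately, 'datetime' becomes 'date',
# anything else such as 'interval' or 'cron' is passed through); schedule_data is
# forwarded as the trigger's keyword arguments.
conn.root.add_task(task_id=42, schedule_type='interval', schedule_data={'minutes': 30})

conn.close()
```
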
35 |
36 | ### Environment (Docker)
37 | - Docker 18.03+
38 | - PostgreSQL 9.x+
39 | - RabbitMQ 3.8.x+
40 | - Docker Compose 1.24+
41 |
42 |
43 | ## Architecture
44 | 
45 | The Crawloop architecture consists of a master node and multiple worker nodes, plus gRPC for communication and a PostgreSQL database for storage.
46 | 
47 | ![Architecture](架构图.png)
48 | 
49 | Client applications request data from the master node. The master dispatches tasks and balances load through Celery and RabbitMQ; once a worker node receives a task it runs the crawl and reports the results back to the master over gRPC, after which they are persisted to the database.
50 | 
51 | The master node is the core of the whole Crawloop architecture and acts as its control center.
52 | 
53 | The master node is mainly responsible for:
54 | 1. Periodic task scheduling
55 | 2. Worker node management and communication
56 | 3. The external API service
57 | 
58 | The master node communicates with clients and asynchronously dispatches crawl tasks to worker nodes through Celery, using a load-balancing algorithm.
59 |
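When a schedule fires, that dispatch is a plain Celery `send_task` call onto the `engine` queue. A condensed sketch of what `services/engine/aps/func.py` does (the max-instances check is omitted), assuming it runs inside the engine service where `worker.celery_app` is importable:

```python
from worker import celery_app  # Celery application defined in the engine service

task_id = 42  # id of a previously created crawl task (hypothetical value)

# Ask an engine worker to split the task into shards and deliver them to crawler executors.
celery_app.send_task(
    name='delivery_task', queue='engine',
    kwargs={'task_id': task_id}
)
```
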
60 | ### Worker nodes
61 | 
62 | Worker nodes execute crawl tasks, call back with the crawled data and logs, and communicate with the master node over gRPC. By adding worker nodes, Crawloop scales horizontally, and different crawl tasks can be assigned to different nodes.
63 |
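Crawl results can also be pushed back over gRPC: the engine's `services/engine/rpc/client/callback_client.py` calls `result.Result/SaveResult` on whatever callback address a task was created with. A minimal sketch of a receiver for that callback, assuming the generated `callback_pb2` / `callback_pb2_grpc` modules from `services/engine/rpc/pb/` are importable and that `0.0.0.0:50051` is the (hypothetical) callback address configured for the task:

```python
import json
from concurrent import futures

import grpc

from rpc.pb import callback_pb2, callback_pb2_grpc  # generated modules shipped with the engine


class CallbackReceiver(callback_pb2_grpc.ResultServicer):
    def SaveResult(self, request, context):
        # The engine sends intermediate results while the task is executing and a final
        # call with finished=True once the whole task is done.
        if request.finished:
            print(f'task {request.task_id} finished for customer {request.customer_id}')
        else:
            result = json.loads(request.crawl_results)  # crawl_results is a JSON document
            print(f'task {request.task_id}: received a partial result with {len(result)} fields')
        # The engine's CallbackClient does not inspect the response fields.
        return callback_pb2.SaveResultResponse()


if __name__ == '__main__':
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    callback_pb2_grpc.add_ResultServicer_to_server(CallbackReceiver(), server)
    server.add_insecure_port('0.0.0.0:50051')
    server.start()
    server.wait_for_termination()
```
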
--------------------------------------------------------------------------------
/engine.docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | engine:
5 | container_name: engine
6 | build:
7 | context: ./services/engine
8 | dockerfile: Dockerfile-dev
9 | volumes:
10 | - './services/engine:/usr/src/app'
11 | ports:
12 | - 15000:5000
13 | env_file:
14 | - engine.env
15 | environment:
16 | - FLASK_APP=webs:create_app
17 | - ENDPOINT=web
18 | restart: always
19 |
20 | engine-worker:
21 | container_name: engine-worker
22 | build:
23 | context: ./services/engine
24 | dockerfile: Dockerfile-dev
25 | volumes:
26 | - './services/engine:/usr/src/app'
27 | env_file:
28 | - engine.env
29 | environment:
30 | - ENDPOINT=engine-worker
31 | restart: always
32 |
33 | engine-grpc:
34 | container_name: engine-grpc
35 | build:
36 | context: ./services/engine
37 | dockerfile: Dockerfile-dev
38 | volumes:
39 | - './services/engine:/usr/src/app'
40 | ports:
41 | - 15002:15002
42 | env_file:
43 | - engine.env
44 | environment:
45 | - ENDPOINT=engine-grpc
46 | restart: always
47 |
48 | engine-apscheduler:
49 | container_name: engine-apscheduler
50 | build:
51 | context: ./services/engine
52 | dockerfile: Dockerfile-dev
53 | volumes:
54 | - './services/engine:/usr/src/app'
55 | ports:
56 | - 15003:15003
57 | env_file:
58 | - engine.env
59 | environment:
60 | - ENDPOINT=apscheduler
61 | restart: always
--------------------------------------------------------------------------------
/image_build/msyh.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/image_build/msyh.ttf
--------------------------------------------------------------------------------
/image_build/playwright-xvfb:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1
4 |
5 | # Replace the default apt sources
6 | COPY ./sources.list /etc/apt/
7 | 
8 | # Install system dependencies
9 | RUN apt-get update && apt-get -y install libnss3 xvfb gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 \
10 | libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 \
11 | libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 \
12 | libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 \
13 | libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
14 |
15 | # Set the timezone
16 | ENV TZ=Asia/Shanghai
17 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
18 | 
19 | # Install the default font
20 | COPY ./msyh.ttf /usr/share/fonts/
21 | RUN fc-cache -fv
22 | 
23 | # Start the Xvfb virtual display
24 | RUN Xvfb -screen 0 1020x720x16 :99 &
25 | RUN export DISPLAY=:99
26 | ENV DISPLAY=:99
27 | 
28 | # Install the Playwright driver and browsers
29 | RUN pip install playwright==0.162.2 -i https://pypi.douban.com/simple
30 | RUN python -m playwright install
31 |
--------------------------------------------------------------------------------
/image_build/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/engine/.dockerignore:
--------------------------------------------------------------------------------
1 | env
2 | *.env
3 | .dockerignore
4 | Dockerfile-dev
5 | Dockerfile-prod
6 | htmlcov
7 | celerybeat-schedule
8 | scheduler.lock
9 | celerybeat.pid
10 |
11 |
--------------------------------------------------------------------------------
/services/engine/Dockerfile-dev:
--------------------------------------------------------------------------------
1 | # base image
2 | FROM python:3.7-slim
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1
4 |
5 | # update apt source
6 | COPY ./sources.list /etc/apt/
7 |
8 | # install netcat
9 | RUN apt-get update && \
10 | apt-get -y install netcat && \
11 | apt-get clean
12 |
13 | # set timezone
14 | ENV TZ=Asia/Shanghai
15 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
16 |
17 | # set working directory
18 | RUN mkdir -p /usr/src/app
19 | WORKDIR /usr/src/app
20 |
21 | # add and install requirements
22 | COPY ./requirements.txt /usr/src/app/requirements.txt
23 | RUN pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ && \
24 | pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
25 |
26 | # add app
27 | COPY . /usr/src/app
28 |
29 | # run server
30 | CMD ["/usr/src/app/entrypoint.sh"]
31 |
--------------------------------------------------------------------------------
/services/engine/aps/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/services/engine/aps/func.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from aps.logger import scheduler_logger
4 | from webs.api.models.db_proxy import task_model_proxy, schedule_task_proxy
5 | from manage import app
6 |
7 |
8 | def send_task_func(task_id):
9 |     """Dispatch a crawl task"""
10 |     with app.app_context():
11 |         scheduler_logger.info(f'Start scheduling task: [task-id:{task_id}]')
12 |         task_obj = task_model_proxy.find(id=task_id)
13 |         if not task_obj:
14 |             scheduler_logger.info(f'Task already finished or deleted! [task-id:{task_id}]')
15 |             return
16 | 
17 |         # Skip this run if the task already has as many running schedule tasks as max_instances allows
18 |         running_schedule_tasks = schedule_task_proxy.query_running_schedule_tasks(task_id)
19 |         max_instances = task_obj.schedule_options.get('schedule_data', {}).get('max_instances', 1)
20 |         if len(running_schedule_tasks) >= max_instances:
21 |             scheduler_logger.info(f'Task exceeded max_instances, this run is skipped! [task-id:{task_id}]')
22 |             return
23 | 
24 |         # Split the task and dispatch it asynchronously
25 |         from worker import celery_app
26 |         celery_app.send_task(
27 |             name='delivery_task', queue='engine',
28 |             kwargs={'task_id': task_id}
29 |         )
30 |
--------------------------------------------------------------------------------
/services/engine/aps/logger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 |
5 | scheduler_logger = logging.getLogger('scheduler')
6 | stream_handler = logging.StreamHandler()
7 | fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
8 | stream_handler.setFormatter(fmt)
9 | scheduler_logger.addHandler(stream_handler)  # log to the console
10 | scheduler_logger.setLevel(logging.INFO)
11 |
--------------------------------------------------------------------------------
/services/engine/aps/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | import rpyc
6 | from apscheduler.jobstores.base import JobLookupError
7 | from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
8 | from apscheduler.schedulers import SchedulerAlreadyRunningError, SchedulerNotRunningError
9 | from apscheduler.schedulers.background import BackgroundScheduler
10 |
11 | from aps.func import send_task_func
12 | from aps.logger import scheduler_logger
13 |
14 |
15 | ##################
16 | # APScheduler configuration
17 |
18 |
19 | class APSchedulerConfig(object):
20 |     # Timezone
21 |     timezone = 'Asia/Shanghai'
22 | 
23 |     # Job store backend
24 |     jobstores = {
25 |         'default': SQLAlchemyJobStore(url=os.getenv('DATABASE_URL'), tablename='apscheduler_jobs')
26 |     }
27 |
28 |     # Executors
29 |     executors = {
30 |         'default': {'type': 'threadpool', 'max_workers': 10}
31 |     }
32 | 
33 |     # Job defaults
34 |     job_defaults = {
35 |         'coalesce': True,  # merge runs when several run times are due at once
36 |         'misfire_grace_time': 3600,  # how many seconds a run may be delayed, so jobs that were due while the scheduler was down can still fire after a restart
37 |         'max_instances': 1  # maximum number of concurrently running instances
38 |     }
39 |
40 |
41 | ##################
42 | # APScheduler service
43 | class APSchedulerService(rpyc.Service):
44 | @staticmethod
45 | def start():
46 | try:
47 | apscheduler.start(paused=False)
48 | scheduler_logger.info('Started APScheduler Success!')
49 | except SchedulerAlreadyRunningError:
50 | scheduler_logger.info('APScheduler Already Running!')
51 |
52 | @staticmethod
53 | def shutdown():
54 | try:
55 | apscheduler.shutdown()
56 | except SchedulerNotRunningError:
57 | scheduler_logger.info('Scheduler has been shut down!')
58 |
59 | @staticmethod
60 | def exposed_add_task(task_id, schedule_type, schedule_data):
61 | """
62 | Add a scheduled task
63 | :param task_id:
64 | :param schedule_type:
65 | :param schedule_data:
66 | :return:
67 | """
68 | trigger_map = {'instantly': None, 'datetime': 'date'}
69 | apscheduler.add_job(
70 | func=send_task_func, id=str(task_id), kwargs={'task_id': task_id},
71 | trigger=trigger_map.get(schedule_type, schedule_type),
72 | **schedule_data
73 | )
74 |
75 | @staticmethod
76 | def exposed_delete_task(task_id, jobstore=None):
77 | """
78 | Delete a scheduled task
79 | :param task_id:
80 | :param jobstore:
81 | :return:
82 | """
83 | try:
84 | apscheduler.remove_job(job_id=str(task_id), jobstore=jobstore)
85 | except JobLookupError:
86 | scheduler_logger.warning('Job was not found or this job has ended!')
87 |
88 | @staticmethod
89 | def exposed_pause_task(task_id, jobstore=None):
90 | """
91 | Pause a scheduled task
92 | :param task_id:
93 | :param jobstore:
94 | :return:
95 | """
96 |
97 | try:
98 | apscheduler.pause_job(job_id=str(task_id), jobstore=jobstore)
99 | except JobLookupError:
100 | scheduler_logger.warning('Job was not found or this job has ended!')
101 |
102 | @staticmethod
103 | def exposed_resume_task(task_id, jobstore=None):
104 | """
105 | Resume a scheduled task
106 | :param task_id:
107 | :param jobstore:
108 | :return:
109 | """
110 |
111 | try:
112 | apscheduler.resume_job(job_id=str(task_id), jobstore=jobstore)
113 | except JobLookupError:
114 | scheduler_logger.warning('Job was not found or this job has ended!')
115 |
116 |
117 | ###### Create the APScheduler instance
118 | apscheduler = BackgroundScheduler(
119 | jobstores=APSchedulerConfig.jobstores, executors=APSchedulerConfig.executors,
120 | job_defaults=APSchedulerConfig.job_defaults, timezone=APSchedulerConfig.timezone)
121 |
122 | ###### Create the APScheduler service object for callers to use
123 | apscheduler_server = APSchedulerService()
124 |
--------------------------------------------------------------------------------
/services/engine/apserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from rpyc import ThreadedServer
5 |
6 | from aps.server import apscheduler_server, APSchedulerService
7 |
8 |
9 | def runserver():
10 |     """Run the APScheduler RPC service"""
11 | 
12 |     # Start APScheduler in the background
13 |     apscheduler_server.start()
14 | 
15 |     # Wrap APScheduler in an RPC server
16 |     server = ThreadedServer(
17 |         APSchedulerService, port=15003,
18 |         protocol_config={'allow_public_attrs': True, 'allow_pickle': True})
19 | 
20 |     # Start the RPC service
21 |     try:
22 |         server.start()
23 |     except (KeyboardInterrupt, SystemExit):
24 |         pass
25 |     finally:
26 |         apscheduler_server.shutdown()
27 |
28 |
29 | if __name__ == '__main__':
30 | runserver()
31 |
--------------------------------------------------------------------------------
/services/engine/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Remove cached __pycache__ directories
4 | find . -type d -name __pycache__ | xargs rm -rf
5 |
6 | # Compile the code with Cython
7 | python3 compile.py build_ext --inplace
8 | if [ $? -ne 0 ]; then
9 | exit 1
10 | fi
11 |
12 | # Rename the generated .so files (strip the cpython suffix)
13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
16 |
17 | # Delete the .py source files
18 | find ./rpc -name '*.py' | xargs rm -f
19 | find ./webs -name '*.py' | xargs rm -f
20 | find ./worker -name '*.py' | xargs rm -f
21 |
22 | # Remove files that are no longer needed
23 | rm -rf build
24 | rm -f .gitignore
25 | rm -f compile.py
26 | rm -f build.sh
27 |
--------------------------------------------------------------------------------
/services/engine/build_sentry_ini.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import os
5 |
6 | from configobj import ConfigObj
7 |
8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8')
9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False)
10 | log_ini.write()
11 |
--------------------------------------------------------------------------------
/services/engine/compile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Cython.Build import cythonize
4 | from Cython.Distutils import build_ext
5 | from setuptools import setup
6 | from setuptools.extension import Extension
7 |
8 | setup(
9 | ext_modules=cythonize(
10 | [
11 | Extension('rpc.*', ['rpc/*.py']),
12 | Extension('rpc.client.*', ['rpc/client/*.py']),
13 | Extension('rpc.pb.*', ['rpc/pb/*.py']),
14 | Extension('rpc.server.*', ['rpc/server/*.py']),
15 | Extension('webs.*', ['webs/*.py']),
16 | Extension('webs.api.*', ['webs/api/*.py']),
17 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']),
18 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']),
19 | Extension('webs.api.models*', ['webs/api/models/*.py']),
20 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']),
21 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']),
22 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']),
23 | Extension('webs.api.views.*', ['webs/api/views/*.py']),
24 | Extension('webs.core.*', ['webs/core/*.py']),
25 | Extension('webs.core.requests.*', ['webs/core/requests/*.py']),
26 | Extension('worker.*', ['worker/*.py']),
27 | Extension('worker.library.*', ['worker/library/*.py'])
28 | ],
29 | build_dir='build',
30 | compiler_directives=dict(
31 | always_allow_keywords=True, language_level=3
32 | )
33 | ),
34 | cmdclass=dict(
35 | build_ext=build_ext
36 | )
37 | )
38 |
--------------------------------------------------------------------------------
/services/engine/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # web service
4 | if [ "$ENDPOINT" = "web" ]; then
5 |     # development environment
6 |     if [ "$FLASK_ENV" = "development" ]; then
7 |         flask run -h 0.0.0.0 -p 5000
8 | 
9 |     # production environment
10 |     elif [ "$FLASK_ENV" = "production" ]; then
11 | 
12 |         # register sentry
13 |         python build_sentry_ini.py
14 | 
15 |         # serve the flask app with gunicorn
16 |         gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app
17 |     fi
18 | 
19 | # grpc server
20 | elif [ "$ENDPOINT" = "engine-grpc" ]; then
21 | python grpcserver.py
22 |
23 | # Apscheduler
24 | elif [ "$ENDPOINT" = "apscheduler" ]; then
25 | python apserver.py
26 |
27 | # worker
28 | elif [ "$ENDPOINT" = "engine-worker" ]; then
29 | # celery -A worker.celery_app worker -Q engine,base_result -l info -c 5 -n worker_engine@%h
30 | celery -A worker.celery_app worker -Q engine,base_result -l info --pool=prefork --concurrency=10 --prefetch-multiplier 4 --without-heartbeat -n worker_engine@%h
31 | fi
32 |
--------------------------------------------------------------------------------
/services/engine/grpcserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import time
5 | from concurrent import futures
6 |
7 | import grpc
8 |
9 | from rpc.pb import result_pb2_grpc
10 | from rpc.server.result import ResultServicer
11 |
12 |
13 | # Entry point
14 | def run():
15 |     # Run the rpc service on a thread pool
16 |     server = grpc.server(
17 |         futures.ThreadPoolExecutor(max_workers=int(os.getenv('GRPC_SERVER_MAX_WORKER_COUNT', 10))),
18 |         options=[
19 |             (
20 |                 'grpc.max_send_message_length',
21 |                 int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024
22 |             ),
23 |             (
24 |                 'grpc.max_receive_message_length',
25 |                 int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024
26 |             ),
27 |         ]
28 |     )
29 | 
30 |     ###### Register services
31 |     # Save basic crawl results
32 |     result_pb2_grpc.add_ResultServicer_to_server(ResultServicer(), server)
33 | 
34 |     # Set the listen address
35 |     server.add_insecure_port(address='0.0.0.0:15002')
36 | 
37 |     # Start the server
38 |     server.start()
39 | 
40 |     # Block so the rpc service keeps running
41 | try:
42 | while True:
43 | time.sleep(60 * 60 * 24)
44 | except KeyboardInterrupt:
45 | server.stop(0)
46 |
47 |
48 | if __name__ == '__main__':
49 | run()
50 |
--------------------------------------------------------------------------------
/services/engine/gunicorn_config.py:
--------------------------------------------------------------------------------
1 | # Sample Gunicorn configuration file.
2 |
3 | import multiprocessing as mlp
4 |
5 | # Fix infinite recursion by monkey patching early
6 | import os
7 |
8 | import gevent.monkey
9 |
10 | gevent.monkey.patch_all()
11 |
12 | #
13 | # Server socket
14 | #
15 | # bind - The socket to bind.
16 | #
17 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'.
18 | # An IP is a valid HOST.
19 | #
20 | # backlog - The number of pending connections. This refers
21 | # to the number of clients that can be waiting to be
22 | # served. Exceeding this number results in the client
23 | # getting an error when attempting to connect. It should
24 | # only affect servers under significant load.
25 | #
26 | # Must be a positive integer. Generally set in the 64-2048
27 | # range.
28 | #
29 |
30 | bind = '0.0.0.0:5000'
31 | backlog = 2048
32 |
33 | #
34 | # Worker processes
35 | #
36 | # workers - The number of worker processes that this server
37 | # should keep alive for handling requests.
38 | #
39 | # A positive integer generally in the 2-4 x $(NUM_CORES)
40 | # range. You'll want to vary this a bit to find the best
41 | # for your particular application's work load.
42 | #
43 | # worker_class - The type of workers to use. The default
44 | # sync class should handle most 'normal' types of work
45 | # loads. You'll want to read
46 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type
47 | # for information on when you might want to choose one
48 | # of the other worker classes.
49 | #
50 | # A string referring to a Python path to a subclass of
51 | # gunicorn.workers.base.Worker. The default provided values
52 | # can be seen at
53 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class
54 | #
55 | # worker_connections - For the eventlet and gevent worker classes
56 | # this limits the maximum number of simultaneous clients that
57 | # a single process can handle.
58 | #
59 | # A positive integer generally set to around 1000.
60 | #
61 | # timeout - If a worker does not notify the master process in this
62 | # number of seconds it is killed and a new worker is spawned
63 | # to replace it.
64 | #
65 | # Generally set to thirty seconds. Only set this noticeably
66 | # higher if you're sure of the repercussions for sync workers.
67 | # For the non sync workers it just means that the worker
68 | # process is still communicating and is not tied to the length
69 | # of time required to handle a single request.
70 | #
71 | # keepalive - The number of seconds to wait for the next request
72 | # on a Keep-Alive HTTP connection.
73 | #
74 | # A positive integer. Generally set in the 1-5 seconds range.
75 | #
76 |
77 | # Number of processes
78 | workers = mlp.cpu_count() * 2 + 1
79 |
80 | # Threads
81 | threads = mlp.cpu_count() * 2
82 |
83 | worker_class = 'gevent'
84 | worker_connections = 1000
85 | timeout = os.getenv('GUNICORN_TIMEOUT', 180)
86 | keepalive = 2
87 |
88 | #
89 | # spew - Install a trace function that spews every line of Python
90 | # that is executed when running the server. This is the
91 | # nuclear option.
92 | #
93 | # True or False
94 | #
95 |
96 | spew = False
97 |
98 | #
99 | # Server mechanics
100 | #
101 | # daemon - Detach the main Gunicorn process from the controlling
102 | # terminal with a standard fork/fork sequence.
103 | #
104 | # True or False
105 | #
106 | # raw_env - Pass environment variables to the execution environment.
107 | #
108 | # pidfile - The path to a pid file to write
109 | #
110 | # A path string or None to not write a pid file.
111 | #
112 | # user - Switch worker processes to run as this user.
113 | #
114 | # A valid user id (as an integer) or the name of a user that
115 | # can be retrieved with a call to pwd.getpwnam(value) or None
116 | # to not change the worker process user.
117 | #
118 | # group - Switch worker process to run as this group.
119 | #
120 | # A valid group id (as an integer) or the name of a user that
121 | # can be retrieved with a call to pwd.getgrnam(value) or None
122 | # to change the worker processes group.
123 | #
124 | # umask - A mask for file permissions written by Gunicorn. Note that
125 | # this affects unix socket permissions.
126 | #
127 | # A valid value for the os.umask(mode) call or a string
128 | # compatible with int(value, 0) (0 means Python guesses
129 | # the base, so values like "0", "0xFF", "0022" are valid
130 | # for decimal, hex, and octal representations)
131 | #
132 | # tmp_upload_dir - A directory to store temporary request data when
133 | # requests are read. This will most likely be disappearing soon.
134 | #
135 | # A path to a directory where the process owner can write. Or
136 | # None to signal that Python should choose one on its own.
137 | #
138 |
139 | #
140 | # Logging
141 | #
142 | # logfile - The path to a log file to write to.
143 | #
144 | # A path string. "-" means log to stdout.
145 | #
146 | # loglevel - The granularity of log output
147 | #
148 | # A string of "debug", "info", "warning", "error", "critical"
149 | #
150 |
151 | errorlog = '-'
152 | loglevel = 'error'
153 | accesslog = '-'
154 | access_log_format = '{"request_address": "%(h)s", ' \
155 | '"request_time": "%(t)s", ' \
156 | '"request": "%(r)s", ' \
157 | '"http_status_code": "%(s)s", ' \
158 | '"http_request_url": "%(U)s", ' \
159 | '"http_query_string": "%(q)s", ' \
160 | '"request_headers": {' \
161 | '"content-type": "%({content-type}i)s", ' \
162 | '"content-length": "%({content-length}i)s", ' \
163 | '"user-agent": "%(a)s"' \
164 | '}}'
165 |
--------------------------------------------------------------------------------
/services/engine/gunicorn_logging.ini:
--------------------------------------------------------------------------------
1 | # Logging configuration
2 |
3 | [loggers]
4 | keys = root, gunicorn.access, gunicorn.error
5 |
6 | [handlers]
7 | keys = access, error, sentry
8 |
9 | [formatters]
10 | keys = json, generic
11 |
12 | # Root logger
13 | # The root logger sends messages to the console and to Sentry.
14 | [logger_root]
15 | handlers = error, sentry
16 |
17 | # Gunicorn loggers
18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'.
19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation.
20 | # Only the critical logger has a handler to send messages to Sentry.
21 |
22 | [logger_gunicorn.access]
23 | level = INFO
24 | handlers = access
25 | propagate = 0
26 | qualname = gunicorn.access
27 |
28 | [logger_gunicorn.error]
29 | level = ERROR
30 | handlers = error, sentry
31 | propagate = 0
32 | qualname = gunicorn.error
33 |
34 | # Handlers
35 | [handler_access]
36 | class = StreamHandler
37 | formatter = json
38 | args = (sys.stdout, )
39 |
40 | [handler_error]
41 | class = StreamHandler
42 | formatter = json
43 | args = (sys.stderr,)
44 |
45 | [handler_sentry]
46 | class = raven.handlers.logging.SentryHandler
47 | level = ERROR
48 | formatter = generic
49 | sentry_dsn = example
50 | args = [%(sentry_dsn)s]
51 |
52 | [formatter_generic]
53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s
54 | [formatter_json]
55 | class = webs.api.utils.loggers.JSONFormatter
--------------------------------------------------------------------------------
/services/engine/manage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import click
4 | from flask.cli import FlaskGroup
5 | from webs import create_app
6 |
7 | app = create_app()
8 | cli = FlaskGroup(create_app=create_app)
9 |
10 |
11 | @cli.command('add_spider_server')
12 | @click.argument('address')
13 | def _add_spider_server(address):
14 | from webs.api.utils.helper import add_spider_server
15 | add_spider_server(address)
16 |
17 |
18 | if __name__ == '__main__':
19 | cli()
20 |
--------------------------------------------------------------------------------
/services/engine/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/services/engine/migrations/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # template used to generate migration files
5 | # file_template = %%(rev)s_%%(slug)s
6 |
7 | # set to 'true' to run the environment during
8 | # the 'revision' command, regardless of autogenerate
9 | # revision_environment = false
10 |
11 |
12 | # Logging configuration
13 | [loggers]
14 | keys = root,sqlalchemy,alembic
15 |
16 | [handlers]
17 | keys = console
18 |
19 | [formatters]
20 | keys = generic
21 |
22 | [logger_root]
23 | level = WARN
24 | handlers = console
25 | qualname =
26 |
27 | [logger_sqlalchemy]
28 | level = WARN
29 | handlers =
30 | qualname = sqlalchemy.engine
31 |
32 | [logger_alembic]
33 | level = INFO
34 | handlers =
35 | qualname = alembic
36 |
37 | [handler_console]
38 | class = StreamHandler
39 | args = (sys.stderr,)
40 | level = NOTSET
41 | formatter = generic
42 |
43 | [formatter_generic]
44 | format = %(levelname)-5.5s [%(name)s] %(message)s
45 | datefmt = %H:%M:%S
46 |
--------------------------------------------------------------------------------
/services/engine/migrations/env.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 |
3 | import logging
4 | from logging.config import fileConfig
5 |
6 | from sqlalchemy import engine_from_config
7 | from sqlalchemy import pool
8 |
9 | from alembic import context
10 |
11 | # this is the Alembic Config object, which provides
12 | # access to the values within the .ini file in use.
13 | config = context.config
14 |
15 | # Interpret the config file for Python logging.
16 | # This line sets up loggers basically.
17 | fileConfig(config.config_file_name)
18 | logger = logging.getLogger('alembic.env')
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | from flask import current_app
25 | config.set_main_option('sqlalchemy.url',
26 | current_app.config.get('SQLALCHEMY_DATABASE_URI'))
27 | target_metadata = current_app.extensions['migrate'].db.metadata
28 |
29 | # other values from the config, defined by the needs of env.py,
30 | # can be acquired:
31 | # my_important_option = config.get_main_option("my_important_option")
32 | # ... etc.
33 |
34 |
35 | def run_migrations_offline():
36 | """Run migrations in 'offline' mode.
37 |
38 | This configures the context with just a URL
39 | and not an Engine, though an Engine is acceptable
40 | here as well. By skipping the Engine creation
41 | we don't even need a DBAPI to be available.
42 |
43 | Calls to context.execute() here emit the given string to the
44 | script output.
45 |
46 | """
47 | url = config.get_main_option("sqlalchemy.url")
48 | context.configure(
49 | url=url, target_metadata=target_metadata, literal_binds=True
50 | )
51 |
52 | with context.begin_transaction():
53 | context.run_migrations()
54 |
55 |
56 | def run_migrations_online():
57 | """Run migrations in 'online' mode.
58 |
59 | In this scenario we need to create an Engine
60 | and associate a connection with the context.
61 |
62 | """
63 |
64 | # this callback is used to prevent an auto-migration from being generated
65 | # when there are no changes to the schema
66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
67 | def process_revision_directives(context, revision, directives):
68 | if getattr(config.cmd_opts, 'autogenerate', False):
69 | script = directives[0]
70 | if script.upgrade_ops.is_empty():
71 | directives[:] = []
72 | logger.info('No changes in schema detected.')
73 |
74 | connectable = engine_from_config(
75 | config.get_section(config.config_ini_section),
76 | prefix='sqlalchemy.',
77 | poolclass=pool.NullPool,
78 | )
79 |
80 | with connectable.connect() as connection:
81 | context.configure(
82 | connection=connection,
83 | target_metadata=target_metadata,
84 | process_revision_directives=process_revision_directives,
85 | **current_app.extensions['migrate'].configure_args
86 | )
87 |
88 | with context.begin_transaction():
89 | context.run_migrations()
90 |
91 |
92 | if context.is_offline_mode():
93 | run_migrations_offline()
94 | else:
95 | run_migrations_online()
96 |
--------------------------------------------------------------------------------
/services/engine/migrations/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 |
18 |
19 | def upgrade():
20 | ${upgrades if upgrades else "pass"}
21 |
22 |
23 | def downgrade():
24 | ${downgrades if downgrades else "pass"}
25 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/1569921cac58_加入响应时间和渲染时间.py:
--------------------------------------------------------------------------------
1 | """Add response time and render time
2 |
3 | Revision ID: 1569921cac58
4 | Revises: b3bd5bc9e4e3
5 | Create Date: 2021-04-07 17:11:13.336649
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 | # revision identifiers, used by Alembic.
12 | revision = '1569921cac58'
13 | down_revision = 'b3bd5bc9e4e3'
14 | branch_labels = None
15 | depends_on = None
16 |
17 |
18 | def upgrade():
19 | # ### commands auto generated by Alembic - please adjust! ###
20 | op.add_column('results', sa.Column('load_complete_time', sa.Integer(), nullable=True))
21 | op.add_column('results', sa.Column('response_time', sa.Integer(), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'response_time')
28 | op.drop_column('results', 'load_complete_time')
29 | # ### end Alembic commands ###
30 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/5b189e0161ee_加入网站编码.py:
--------------------------------------------------------------------------------
1 | """Add website charset
2 |
3 | Revision ID: 5b189e0161ee
4 | Revises: 1569921cac58
5 | Create Date: 2021-04-19 10:32:15.201074
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '5b189e0161ee'
14 | down_revision = '1569921cac58'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('charset', sa.String(length=256), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'charset')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/71bf761944f8_新增网站图标字段.py:
--------------------------------------------------------------------------------
1 | """Add website favicon fields
2 |
3 | Revision ID: 71bf761944f8
4 | Revises: 4a243739ef84
5 | Create Date: 2021-01-21 16:39:56.687514
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '71bf761944f8'
14 | down_revision = '4a243739ef84'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('favicon_link', sa.String(length=1024), nullable=True))
22 | op.add_column('results', sa.Column('favicon_md5', sa.String(length=50), nullable=True))
23 | # ### end Alembic commands ###
24 |
25 |
26 | def downgrade():
27 | # ### commands auto generated by Alembic - please adjust! ###
28 | op.drop_column('results', 'favicon_md5')
29 | op.drop_column('results', 'favicon_link')
30 | # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/b3bd5bc9e4e3_增加extra客户端额外数据.py:
--------------------------------------------------------------------------------
1 | """Add extra client-provided data
2 |
3 | Revision ID: b3bd5bc9e4e3
4 | Revises: 71bf761944f8
5 | Create Date: 2021-01-25 17:52:28.285830
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = 'b3bd5bc9e4e3'
14 | down_revision = '71bf761944f8'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('tasks', sa.Column('extra_data', sa.Text(), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('tasks', 'extra_data')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/engine/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.7.2
2 | alembic==1.4.3
3 | amqp==2.6.1
4 | APScheduler==3.6.3
5 | async-timeout==3.0.1
6 | attrs==20.3.0
7 | beautifulsoup4==4.9.3
8 | billiard==3.6.3.0
9 | celery==4.3.0
10 | certifi==2020.11.8
11 | chardet==3.0.4
12 | click==7.1.2
13 | configobj==5.0.6
14 | Cython==0.29.21
15 | Flask==1.1.2
16 | Flask-Migrate==2.4.0
17 | Flask-Redis==0.3.0
18 | Flask-SQLAlchemy==2.3.2
19 | gevent==1.4.0
20 | greenlet==0.4.15
21 | grpcio==1.33.2
22 | grpcio-tools==1.33.2
23 | gunicorn==19.9.0
24 | html2text==2020.1.16
25 | idna==2.8
26 | importlib-metadata==2.0.0
27 | itsdangerous==1.1.0
28 | Jinja2==2.11.2
29 | kombu==4.6.11
30 | Mako==1.1.3
31 | MarkupSafe==1.1.1
32 | marshmallow==2.19.2
33 | multidict==5.0.2
34 | Pillow==8.2.0
35 | plumbum==1.6.9
36 | protobuf==3.12.2 --no-binary protobuf
37 | psycopg2-binary==2.7.6.1
38 | python-dateutil==2.8.1
39 | python-editor==1.0.4
40 | pytz==2020.4
41 | raven==6.10.0
42 | redis==3.5.3
43 | requests==2.22.0
44 | rpyc==4.1.5
45 | six==1.15.0
46 | soupsieve==2.1
47 | SQLAlchemy==1.3.20
48 | typing-extensions==3.7.4.3
49 | tzlocal==2.1
50 | urllib3==1.25.11
51 | vine==1.3.0
52 | webargs==4.0.0
53 | Werkzeug==1.0.1
54 | yarl==1.6.3
55 | zipp==3.4.0
56 |
--------------------------------------------------------------------------------
/services/engine/rpc/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/client/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/client/callback_client.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import os
4 |
5 | import grpc
6 |
7 | from rpc.pb import callback_pb2
8 | from rpc.pb.callback_pb2_grpc import ResultStub
9 |
10 | CHUNK_SIZE = 10 * 1024
11 |
12 |
13 | class CallbackClient(object):
14 |
15 | def __init__(self, rpc_server):
16 | # Channel to the RPC server
17 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[
18 | ('grpc.max_send_message_length', int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024),
19 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024),
20 | ])
21 |
22 | # Result gRPC service stub
23 | self.stub = ResultStub(channel)
24 |
25 | def callback_save_result(self, task_obj, result):
26 | """
27 | Call back with the crawl result for a task
28 | :return:
29 | """
30 | result['extra_data'] = task_obj.extra_data
31 | self.stub.SaveResult(
32 | callback_pb2.SaveResultRequest(
33 | customer_id=task_obj.customer_id,
34 | task_id=task_obj.id,
35 | task_status='executing',
36 | finished=False,
37 | crawl_results=json.dumps(result, ensure_ascii=False)
38 | ),
39 | timeout=30
40 | )
41 |
42 | def callback_task_finished(self, customer_id, task_id):
43 | """Call back that the task has finished"""
44 |
45 | self.stub.SaveResult(
46 | callback_pb2.SaveResultRequest(
47 | customer_id=customer_id, task_id=task_id,
48 | finished=True, task_status='finished'),
49 | timeout=30
50 | )
51 |
--------------------------------------------------------------------------------
/services/engine/rpc/codegen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Declare the proto source path and the output path for generated pb files
4 | declare -a proto_path=("protos")
5 | declare -a python_out=("pb")
6 |
7 | # Generate the pb files
8 | python -m grpc_tools.protoc \
9 | --proto_path=$proto_path/ \
10 | --python_out=$python_out \
11 | --grpc_python_out=$python_out \
12 | $proto_path/*.proto
13 |
14 | # Rewrite the broken import statements in the generated pb files
15 | sed -i '' -E 's/^import (.*pb2)/from . import \1/g' ${python_out}/*pb2*.py
--------------------------------------------------------------------------------
/services/engine/rpc/pb/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/pb/callback_pb2_grpc.py:
--------------------------------------------------------------------------------
1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2 | """Client and server classes corresponding to protobuf-defined services."""
3 | import grpc
4 |
5 | from . import callback_pb2 as result__pb2
6 |
7 |
8 | class ResultStub(object):
9 | """Service definition
10 | """
11 |
12 | def __init__(self, channel):
13 | """Constructor.
14 |
15 | Args:
16 | channel: A grpc.Channel.
17 | """
18 | self.SaveResult = channel.unary_unary(
19 | '/result.Result/SaveResult',
20 | request_serializer=result__pb2.SaveResultRequest.SerializeToString,
21 | response_deserializer=result__pb2.SaveResultResponse.FromString,
22 | )
23 |
24 |
25 | class ResultServicer(object):
26 | """Service definition
27 | """
28 |
29 | def SaveResult(self, request, context):
30 | """Save basic crawl results
31 | """
32 | context.set_code(grpc.StatusCode.UNIMPLEMENTED)
33 | context.set_details('Method not implemented!')
34 | raise NotImplementedError('Method not implemented!')
35 |
36 |
37 | def add_ResultServicer_to_server(servicer, server):
38 | rpc_method_handlers = {
39 | 'SaveResult': grpc.unary_unary_rpc_method_handler(
40 | servicer.SaveResult,
41 | request_deserializer=result__pb2.SaveResultRequest.FromString,
42 | response_serializer=result__pb2.SaveResultResponse.SerializeToString,
43 | ),
44 | }
45 | generic_handler = grpc.method_handlers_generic_handler(
46 | 'result.Result', rpc_method_handlers)
47 | server.add_generic_rpc_handlers((generic_handler,))
48 |
49 |
50 | # This class is part of an EXPERIMENTAL API.
51 | class Result(object):
52 | """Service definition
53 | """
54 |
55 | @staticmethod
56 | def SaveResult(request,
57 | target,
58 | options=(),
59 | channel_credentials=None,
60 | call_credentials=None,
61 | insecure=False,
62 | compression=None,
63 | wait_for_ready=None,
64 | timeout=None,
65 | metadata=None):
66 | return grpc.experimental.unary_unary(request, target, '/result.Result/SaveResult',
67 | result__pb2.SaveResultRequest.SerializeToString,
68 | result__pb2.SaveResultResponse.FromString,
69 | options, channel_credentials,
70 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
71 |
--------------------------------------------------------------------------------
/services/engine/rpc/protos/result.proto:
--------------------------------------------------------------------------------
1 | // Use the proto3 syntax
2 | syntax = "proto3";
3 | 
4 | // Current package
5 | package result;
6 | 
7 | 
8 | // Service definition
9 | service Result {
10 |     // Save basic crawl results
11 |     rpc SaveBaseResult(SaveBaseResultRequest) returns (SaveBaseResultResponse);
12 |     // Upload screenshots as a stream
13 |     rpc StreamUploadPicture(stream StreamUploadPictureRequest) returns (StreamUploadPictureResponse);
14 |     // Upload HAR files as a stream
15 |     rpc StreamUploadHarFile(stream StreamUploadHarFileRequest) returns (StreamUploadHarFileResponse);
16 |     // Mark the crawl status of a subtask
17 |     rpc SetSubTaskStatus(SetSubTaskStatusRequest) returns (SetSubTaskStatusResponse);
18 | }
19 |
20 |
21 | // Request message
22 | message SaveBaseResultRequest {
23 | int32 subtask_id = 1;
24 | int32 url_id = 2;
25 | string url_address = 3;
26 | int32 http_code = 4;
27 | string title = 5;
28 | string content = 6;
29 | string current_url = 7;
30 | string screenshot_id = 8;
31 | string response_headers = 9;
32 | string finished_at = 10;
33 | string har_uuid = 11;
34 | repeated RedirectChain redirect_chain = 12;
35 | repeated Cookies cookies = 13;
36 | int32 response_time = 14;
37 | int32 load_complete_time = 15;
38 | }
39 |
40 |
41 | // Redirect chain entries
42 | message RedirectChain {
43 | string redirect_url = 1;
44 | int32 redirect_http_code = 2;
45 | }
46 |
47 |
48 | // Cookies
49 | message Cookies {
50 | string name = 1;
51 | string path = 2;
52 | string value = 3;
53 | string domain = 4;
54 | bool secure = 5;
55 | int64 expires = 6;
56 | bool httpOnly = 7;
57 | string sameSite = 8;
58 | }
59 |
60 |
61 | // Response
62 | message SaveBaseResultResponse{
63 | bool status = 1;
64 | }
65 |
66 | // Screenshot stream
67 | message StreamUploadPictureRequest {
68 | message FileData {
69 | string filename = 1;
70 | bytes buffer = 2;
71 | }
72 | oneof payload {
73 | string filename = 1;
74 | FileData file_data = 2;
75 | }
76 | }
77 |
78 | // Received screenshot length
79 | message StreamUploadPictureResponse {
80 | int32 length = 1;
81 | }
82 |
83 | // Subtask id and status
84 | message SetSubTaskStatusRequest {
85 | int32 subtask_id = 1;
86 | bool status = 2;
87 | string finished_at = 3;
88 | }
89 |
90 | // Response for setting the subtask status
91 | message SetSubTaskStatusResponse {
92 | bool set_success = 1;
93 | }
94 |
95 | // HAR file stream
96 | message StreamUploadHarFileRequest {
97 | message FileData {
98 | string filename = 1;
99 | bytes buffer = 2;
100 | }
101 | oneof payload {
102 | string filename = 1;
103 | FileData file_data = 2;
104 | }
105 | }
106 |
107 | // Received HAR file length
108 | message StreamUploadHarFileResponse {
109 | int32 length = 1;
110 | }
--------------------------------------------------------------------------------
/services/engine/rpc/server/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/server/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from google.protobuf.json_format import MessageToDict
6 |
7 | from manage import app
8 | from rpc.pb import result_pb2, result_pb2_grpc
9 | from webs.api.models.db_proxy import subtask_model_proxy, task_model_proxy, schedule_task_proxy, apscheduler_model_proxy
10 | from worker import celery_app
11 |
12 |
13 | def save_chunks_to_file(request_streams, folder_path):
14 | filename, file_chunks = None, []
15 | for yield_obj in request_streams:
16 | if getattr(yield_obj, 'filename'):
17 | filename = yield_obj.filename
18 | else:
19 | file_chunks.append(yield_obj.file_data)
20 | with open(f'/usr/src/app/{folder_path}/{filename}', 'wb') as f:
21 | for chunk in file_chunks:
22 | f.write(chunk.buffer)
23 | return os.path.getsize(f'/usr/src/app/{folder_path}/{filename}')
24 |
25 |
26 | class ResultServicer(result_pb2_grpc.ResultServicer):
27 |     # Create the screenshots directory if it does not exist
28 | if not os.path.exists('/usr/src/app/screenshots'):
29 | os.mkdir('/usr/src/app/screenshots')
30 |
31 |     # Implements the SaveBaseResult rpc
32 | def SaveBaseResult(self, request, context):
33 |         # Convert the proto message body into a dict
34 | request_dict = MessageToDict(request, preserving_proto_field_name=True)
35 |
36 |         # Process the crawl data asynchronously via Celery
37 | celery_app.send_task('save_base_result', queue='base_result', kwargs=request_dict)
38 |
39 | return result_pb2.SaveBaseResultResponse(status=True)
40 |
41 |     # Implements the StreamUploadPicture rpc for streaming image uploads
42 | def StreamUploadPicture(self, request_iterator, context):
43 | try:
44 | file_length = save_chunks_to_file(request_iterator, folder_path='screenshots')
45 | except FileNotFoundError:
46 | file_length = 0
47 | return result_pb2.StreamUploadPictureResponse(length=file_length)
48 |
49 |     # Implements the SetSubTaskStatus rpc to mark a subtask's processing status
50 | def SetSubTaskStatus(self, request, context):
51 |         # Update the subtask status inside a Flask app context
52 | with app.app_context():
53 |             # Set the subtask status
54 | subtask_obj = subtask_model_proxy.set_many_attr(
55 | obj_id=request.subtask_id,
56 | fields_v={'finished': request.status, 'finished_at': request.finished_at}
57 | )
58 |             # Mark the schedule task finished once all its subtasks are done
59 | unfinished_count = subtask_model_proxy.query_unfinished_subtask_count(subtask_obj.schedule_task_id)
60 | if unfinished_count == 0:
61 | schedule_task_obj = schedule_task_proxy.query_schedule_task_obj_by_subtask_id(subtask_obj.id)
62 | schedule_task_proxy.set_many_attr(
63 | obj=schedule_task_obj, fields_v={'schedule_task_status': 'finished', 'finished': True}
64 | )
65 |
66 |                 # Look up the main task
67 | task_id, running_schedule_tasks = schedule_task_proxy.query_running_task_and_task_id(
68 | subtask_obj.schedule_task_id)
69 | task_obj = task_model_proxy.find(id=task_id)
70 |
71 |                 # Notify the caller that the current schedule task has finished
72 | if schedule_task_obj.crawl_options.get('callback_type'):
73 | from rpc.client.callback_client import CallbackClient
74 | try:
75 | callback_client = CallbackClient(rpc_server=task_obj.crawl_options.get('callback_address'))
76 | callback_client.callback_task_finished(customer_id=task_obj.customer_id, task_id=task_id)
77 | except Exception as e:
78 | print(e)
79 |                         print(f"Task-finished callback failed: ID-{task_id}")
80 |
81 |                 # Mark the main task as finished when no schedule tasks remain and no further runs are planned
82 | next_run_time = apscheduler_model_proxy.get_next_run_time(apschedule_id=task_id)
83 | if not running_schedule_tasks and not next_run_time:
84 | task_model_proxy.set_many_attr(
85 | obj=task_obj, fields_v={'task_status': 'finished', 'finished': True}
86 | )
87 |
88 | return result_pb2.SetSubTaskStatusResponse(set_success=True)
89 |
90 |     # Implements the StreamUploadHarFile rpc for streaming HAR file uploads
91 | def StreamUploadHarFile(self, request_iterator, context):
92 | try:
93 | file_length = save_chunks_to_file(request_iterator, folder_path='hars')
94 | except FileNotFoundError:
95 | file_length = 0
96 |         return result_pb2.StreamUploadHarFileResponse(length=file_length)
97 |
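The servicer above is wired into a gRPC server by grpcserver.py (not shown in this section); a minimal, hypothetical bootstrap would look roughly like the sketch below. The bind address and worker count are assumptions.

from concurrent import futures

import grpc

from rpc.pb import result_pb2_grpc
from rpc.server.result import ResultServicer


def serve(bind_address='[::]:50051'):
    # Thread-pool size and bind address are illustrative only.
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    result_pb2_grpc.add_ResultServicer_to_server(ResultServicer(), server)
    server.add_insecure_port(bind_address)
    server.start()
    server.wait_for_termination()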
--------------------------------------------------------------------------------
/services/engine/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/engine/wappalyzer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from wappalyzer.wappalyzerhandler import WappalyzerHandler
4 |
5 | wappalyzer_handler = WappalyzerHandler(techno_path='wappalyzer/data.json')
6 |
--------------------------------------------------------------------------------
/services/engine/wappalyzer/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from typing import Any
6 | import re
7 |
8 | from wappalyzer.modelcalss import Pattern
9 |
10 |
11 | def _transform_patterns(
12 | patterns: Any,
13 | case_sensitive: bool = False
14 | ):
15 | """Canonicalize the patterns of different sections.
16 | """
17 |
18 | def to_list(value):
19 | return value if type(value) is list else [value]
20 |
21 | if not patterns:
22 | return []
23 |
24 | if type(patterns) is str or type(patterns) is list:
25 | patterns = {
26 | "main": patterns
27 | }
28 |
29 | parsed = {}
30 | for key in patterns:
31 | name = key if case_sensitive else key.lower()
32 | parsed[name] = [
33 | _parse_pattern(ptrn, key)
34 | for ptrn in to_list(patterns[key])
35 | ]
36 |
37 | return parsed["main"] if "main" in parsed else parsed
38 |
39 |
40 | def _parse_pattern(pattern: str, key: str = ""):
41 |     """Parse a regex pattern string and create a Pattern object.
42 |     It extracts the regex, the version and the confidence values from
43 |     the given string.
44 | """
45 | parts = pattern.split("\\;")
46 |
47 | value = parts[0]
48 |
49 |     # It seems that in js "[^]" behaves like "."; however, Python's
50 |     # re module interprets it differently (which leads to an error),
51 |     # so it is better to substitute it.
52 | regex = value.replace("/", "\\/").replace("[^]", ".")
53 |
54 | attrs = {
55 | "value": value,
56 | "regex": re.compile(regex, re.I)
57 | }
58 | for attr in parts[1:]:
59 | attr = attr.split(":")
60 | if len(attr) > 1:
61 | attrs[attr[0]] = ":".join(attr[1:])
62 |
63 | return Pattern(
64 | value=attrs["value"],
65 | regex=attrs["regex"],
66 | confidence=int(attrs.get("confidence", 100)),
67 | version=attrs.get("version", ""),
68 | key=key,
69 | )
70 |
71 |
72 | def extract_scripts(html: str):
73 | soup = BeautifulSoup(html, "html.parser")
74 | script_tags = soup.findAll("script")
75 |
76 | scripts = []
77 | for script_tag in script_tags:
78 | try:
79 | src = script_tag.attrs["src"]
80 | if not src.startswith("data:text/javascript;"):
81 | scripts.append(src)
82 | except KeyError:
83 | pass
84 |
85 | return scripts
86 |
87 |
88 | def extract_metas(html: str):
89 | soup = BeautifulSoup(html, "html.parser")
90 | meta_tags = soup.findAll("meta")
91 |
92 | metas = {}
93 | for meta_tag in meta_tags:
94 | try:
95 | key = meta_tag.attrs.get("name", None) \
96 | or meta_tag.attrs["property"]
97 | metas[key.lower()] = [meta_tag.attrs["content"]]
98 | except KeyError:
99 | continue
100 |
101 | return metas
102 |
103 |
104 | def extract_cookies(cookies_list):
105 | cookies_dict = {}
106 | for each_cookie in cookies_list:
107 | cookies_dict.update({each_cookie['name']: each_cookie['value']})
108 | return cookies_dict
109 |
110 |
111 | def extract_headers(headers):
112 | return {
113 | k.lower(): [v]
114 | for k, v in headers.items()
115 | }
116 |
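As a quick illustration of the helpers above, a Wappalyzer-style pattern string splits on the literal `\;` separator into regex, version and confidence parts. The pattern below is made up, and the import assumes the wappalyzer package (and its data.json) resolves from the project root.

from wappalyzer.helper import _parse_pattern

p = _parse_pattern(r"jquery[.-]([\d.]+)\.js\;version:\1\;confidence:50", key="script")
# p.value      -> the raw regex part of the pattern
# p.version    -> '\1', a back-reference into the captured version group
# p.confidence -> 50
assert p.regex.search("/static/jquery-3.6.0.js")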
--------------------------------------------------------------------------------
/services/engine/wappalyzer/modelcalss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class Technology:
4 | def __init__(
5 | self,
6 | name: str,
7 | categories,
8 | url,
9 | headers,
10 | cookies,
11 | html,
12 | meta,
13 | scripts,
14 | js,
15 | implies,
16 | excludes,
17 | icon: str,
18 | website: str,
19 | cpe: str,
20 | ):
21 | self.name = name
22 | self.categories = categories
23 | self.url = url
24 | self.headers = headers
25 | self.cookies = cookies
26 | self.html = html
27 | self.meta = meta
28 | self.scripts = scripts
29 | self.js = js
30 | self.implies = implies
31 | self.excludes = excludes
32 | self.icon = icon
33 | self.website = website
34 | self.cpe = cpe
35 |
36 | def __getitem__(self, k):
37 | return self.__dict__[k]
38 |
39 | def get(self, *args, **kwargs):
40 | return self.__dict__.get(*args, **kwargs)
41 |
42 | def __repr__(self):
43 | return repr(self.__dict__)
44 |
45 |
46 | class Category:
47 | def __init__(self, id: str, name: str, priority: int):
48 | self.id = id
49 | self.name = name
50 | self.priority = priority
51 |
52 |
53 | class Pattern:
54 |
55 | def __init__(
56 | self,
57 | value: str,
58 | regex,
59 | confidence: int,
60 | version: str,
61 | key: str
62 | ):
63 | self.value = value
64 | self.regex = regex
65 | self.confidence = confidence
66 | self.version = version
67 | self.key = key
68 |
69 | def __getitem__(self, k):
70 | return self.__dict__[k]
71 |
72 | def __repr__(self):
73 | return repr(self.__dict__)
74 |
75 |
76 | class Imply:
77 | """Structure to define a technology that is implied by the use of another
78 | one.
79 |
80 | Attributes:
81 | name (str): Name of the implied technology.
82 | confidence (int): Confidence of the implied technology.
83 |
84 | """
85 |
86 | def __init__(self, name: str, confidence: int):
87 | self.name = name
88 | self.confidence = confidence
89 |
90 |
91 | class Exclude:
92 | """Structure to define a technology that is incompatible with another
93 | one.
94 |
95 | Attributes:
96 | name (str): Name of the excluded technology.
97 |
98 | """
99 |
100 | def __init__(self, name: str):
101 | self.name = name
102 |
103 |
104 | class PatternMatch:
105 | """Identifies a match in a technology pattern.
106 |
107 | Attributes:
108 | technology (Technology): Technology identified by the pattern.
109 |         pattern (Pattern): Pattern that caused the match.
110 | version (str): Version identified by the pattern in the match.
111 | """
112 |
113 | def __init__(self, technology: Technology, pattern: Pattern, version: str):
114 | self.technology = technology
115 | self.pattern = pattern
116 | self.version = version
117 |
118 | def __getitem__(self, k):
119 | return self.__dict__[k]
120 |
121 | def __repr__(self):
122 | return repr(self.__dict__)
123 |
124 | def __eq__(self, o):
125 | return (
126 | self.technology.name == o.technology.name
127 |             and self.pattern.key == o.pattern.key
128 |             and self.pattern.value == o.pattern.value
129 | )
130 |
131 | def __hash__(self):
132 | return hash(
133 | (self.technology.name, self.pattern.key, self.pattern.value)
134 | )
135 |
136 |
137 | class TechMatch:
138 | """Identifies a match in a technology.
139 |
140 | Attributes:
141 | technology (Technology): Technology identified.
142 |         confidence (int): Confidence in the match, derived from all the
143 | patterns of this technology that matched.
144 | version (str): Version identified by the patterns.
145 | """
146 |
147 | def __init__(self, technology: Technology, confidence: int, version: str):
148 | self.technology = technology
149 | self.confidence = confidence
150 | self.version = version
151 |
152 | def __getitem__(self, k):
153 | return self.__dict__[k]
154 |
155 | def __repr__(self):
156 | return repr(self.__dict__)
157 |
158 | def __eq__(self, o):
159 | return self.technology.name == o.technology.name
160 |
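
A small sketch (illustrative values only) of why PatternMatch defines __eq__ and __hash__: duplicate hits on the same technology and pattern collapse when collected into a set.

from wappalyzer.modelcalss import Pattern, PatternMatch, Technology

tech = Technology(name="jQuery", categories=[], url=[], headers={}, cookies={},
                  html=[], meta={}, scripts=[], js={}, implies=[], excludes=[],
                  icon="", website="https://jquery.com", cpe="")
pattern = Pattern(value="jquery", regex=None, confidence=100, version="", key="script")

hits = {PatternMatch(tech, pattern, "3.6.0"), PatternMatch(tech, pattern, "3.6.0")}
assert len(hits) == 1  # same technology name + pattern key/value -> one match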
--------------------------------------------------------------------------------
/services/engine/webs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from flask import Flask
6 |
7 | from webs.api.utils.requests import before_request_middleware, \
8 | after_request_middleware, teardown_appcontext_middleware
9 | from webs.api.utils.responses import JSONResponse, app_error_handler
10 | from webs.api.utils.routers import register_routes as init_routes
11 | from webs.api.utils.settings import init_db
12 |
13 |
14 | def create_app():
15 | # instantiate the app
16 | app = Flask(__name__)
17 |
18 | # set config
19 | app_settings = os.getenv('APP_SETTINGS')
20 | app.config.from_object(app_settings)
21 |
22 | # register all blueprints
23 | init_routes(app=app)
24 |
25 | # register custom response class
26 | app.response_class = JSONResponse
27 |
28 | # register custom error handler
29 | app_error_handler(app=app)
30 |
31 | # register before request middleware
32 | before_request_middleware(app=app)
33 |
34 | # register after request middleware
35 | after_request_middleware(app=app)
36 |
37 | # register after app context teardown middleware
38 | teardown_appcontext_middleware(app=app)
39 |
40 | # set up extensions
41 | app_db = init_db(app=app)
42 |
43 | # shell context for flask cli
44 | @app.shell_context_processor
45 | def ctx():
46 | return {'app': app, 'db': app_db}
47 |
48 | return app
49 |
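The factory is normally driven by manage.py / gunicorn; for a quick local run it could be used roughly as below. The config class path is an assumption (the real names live in webs/config.py).

import os

from webs import create_app

# APP_SETTINGS must point at a config object; the class name here is assumed.
os.environ.setdefault('APP_SETTINGS', 'webs.config.DevelopmentConfig')
app = create_app()

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)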
--------------------------------------------------------------------------------
/services/engine/webs/api/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/bizs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/bizs/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import base64
4 | import os
5 | from io import BytesIO
6 |
7 | from PIL import Image
8 | from flask import make_response, send_file
9 |
10 | from webs.api.exceptions.customs import RecordNotFound, InvalidAPIRequest
11 | from webs.api.models.db_proxy import result_model_proxy
12 |
13 |
14 | class ResultBiz(object):
15 |
16 | def result_by_url(self, url, fields):
17 | """
18 |         Query the crawl result for a given url
19 | :param url:
20 | :param fields:
21 | :return:
22 | """
23 |
24 |         # Resolve the url to its id and return the latest result
25 |
26 | return result_model_proxy.get_by_url(url, fields)
27 |
28 | def result_by_id(self, result_id):
29 | """
30 |         Query the result by result id
31 | :param result_id:
32 | :return:
33 | """
34 | return result_model_proxy.get_by_result_id(result_id)
35 |
36 | def get_screenshot(self, screenshot_id, download=False):
37 | """
38 |         Fetch a screenshot
39 | :param screenshot_id:
40 | :param download:
41 | :return:
42 | """
43 |
44 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
45 | if not os.path.exists(screenshot_path):
46 | raise RecordNotFound('截图不存在!')
47 | response = make_response(send_file(
48 | filename_or_fp=screenshot_path,
49 | as_attachment=download
50 | ))
51 | response.direct_passthrough = False
52 | return response
53 |
54 | def get_screenshot_base64_encode(self, screenshot_id):
55 | """
56 |         Return the screenshot as a Base64-encoded string
57 | :param screenshot_id:
58 | :return:
59 | """
60 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
61 | if not os.path.exists(screenshot_path):
62 | raise RecordNotFound('截图不存在!')
63 | with open(screenshot_path, 'rb') as f:
64 | img_encode_str = base64.b64encode(f.read()).decode('utf-8')
65 | return img_encode_str
66 |
67 | def download_har(self, har_uuid):
68 | """
69 |         Download the HAR file
70 | :param har_uuid:
71 | :return:
72 | """
73 | har_path = f'/usr/src/app/hars/{har_uuid}.json'
74 | if not os.path.exists(har_path):
75 | raise RecordNotFound('该文件不存在!')
76 | response = make_response(send_file(
77 | filename_or_fp=har_path,
78 | as_attachment=True
79 | ))
80 | response.direct_passthrough = False
81 | return response
82 |
83 | def get_favicon(self, favicon_md5, download=False):
84 | """
85 |         Fetch the favicon
86 | :param favicon_md5:
87 | :param download:
88 | :return:
89 | """
90 |
91 | newest_record = result_model_proxy.find(favicon_md5=favicon_md5)
92 | if not newest_record:
93 | raise RecordNotFound('图标不存在!')
94 | _, ext = os.path.splitext(newest_record.favicon_link)
95 | favicon_path = f'/usr/src/app/screenshots/{favicon_md5}{ext}'
96 | if not os.path.exists(favicon_path):
97 | raise RecordNotFound('图标不存在!')
98 | response = make_response(send_file(
99 | filename_or_fp=favicon_path,
100 | as_attachment=download
101 | ))
102 | response.direct_passthrough = False
103 | return response
104 |
105 | def get_small_screenshot(self, screenshot_id, wide, high):
106 |         """Return a thumbnail of the screenshot"""
107 |
108 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
109 | if not os.path.exists(screenshot_path):
110 | raise RecordNotFound('截图不存在!')
111 | im = Image.open(f'/usr/src/app/screenshots/{screenshot_id}.png')
112 |
113 | src_wide, src_high = im.size
114 | ratio = src_wide / wide
115 | im = im.resize((wide, int(src_high / ratio)), Image.ANTIALIAS)
116 | im = im.crop((0, 0, wide, high))
117 |
118 |         # Write the thumbnail into an in-memory buffer
119 | byte_io = BytesIO()
120 | im.save(byte_io, 'PNG')
121 | byte_io.seek(0)
122 |
123 | response = make_response(send_file(
124 | filename_or_fp=byte_io,
125 | as_attachment=False,
126 | mimetype='image/png'
127 | # attachment_filename=f'{screenshot_id}.png'
128 | ))
129 | response.direct_passthrough = False
130 | return response
131 |
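The thumbnail rule in get_small_screenshot is: scale to the requested width while keeping the aspect ratio, then crop the top-left wide x high window. A standalone sketch of the same math (function name is made up):

from PIL import Image


def make_thumbnail(src_path, wide=272, high=165):
    im = Image.open(src_path)
    src_wide, src_high = im.size
    ratio = src_wide / wide  # horizontal scale factor
    # Image.ANTIALIAS matches the code above; Pillow >= 10 renamed it to Image.LANCZOS.
    im = im.resize((wide, int(src_high / ratio)), Image.ANTIALIAS)
    return im.crop((0, 0, wide, high))  # keep only the top-left window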
--------------------------------------------------------------------------------
/services/engine/webs/api/exceptions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/exceptions/customs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from werkzeug.exceptions import BadRequest, \
5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict
6 |
7 |
8 | class CustomException(Exception):
9 | """Custom JSON based exception."""
10 |
11 | status_code = BadRequest.code
12 | message = ""
13 |
14 | def __init__(self, message=None, status_code=None):
15 | """
16 | :param status_code: response status_code
17 | :param message: exception message
18 | """
19 |
20 | Exception.__init__(self)
21 |
22 | if message is not None:
23 | self.message = message
24 | if status_code is not None:
25 | self.status_code = status_code
26 |
27 | def to_dict(self):
28 | return {
29 | "status": False,
30 | "error": {
31 | "message": self.message,
32 | "type": str(self.__class__.__name__)
33 | }
34 | }
35 |
36 |
37 | class InvalidContentType(CustomException):
38 | """
39 | Raised when an invalid Content-Type is provided.
40 | """
41 |
42 | status_code = BadRequest.code
43 |
44 |
45 | class UnauthorizedAPIRequest(CustomException):
46 | """
47 | Raise if the user is not authorized. Also used if you want to use HTTP
48 | basic auth.
49 | """
50 |
51 | status_code = Unauthorized.code
52 |
53 |
54 | class InvalidPermissions(CustomException):
55 | """
56 | Raise if the user doesn't have the permission for the requested resource
57 | but was authenticated.
58 | """
59 |
60 | status_code = Forbidden.code
61 |
62 |
63 | class InvalidAPIRequest(CustomException):
64 | """
65 | Raised when an invalid request has been made.
66 |     (e.g. accessed a nonexistent url, or the schema validation did
67 | not pass)
68 | """
69 |
70 | status_code = BadRequest.code
71 |
72 |
73 | class ServerError(CustomException):
74 | """
75 | Generic internal error.
76 | Inherit this error for all subsequent
77 |     errors that are related to the server.
78 | """
79 |
80 | status_code = InternalServerError.code
81 |
82 |
83 | class DatabaseError(CustomException):
84 | """
85 | Generic database interaction error.
86 | Inherit this error for all subsequent
87 | errors that are related to database.
88 | """
89 |
90 | status_code = InternalServerError.code
91 |
92 |
93 | class RecordNotFound(DatabaseError):
94 | """
95 | Raised when the record was not found in the database.
96 | """
97 |
98 | status_code = NotFound.code
99 |
100 |
101 | class RecordAlreadyExists(DatabaseError):
102 | """
103 | Raised in the case of violation of a unique constraint.
104 | """
105 |
106 | status_code = Conflict.code
107 |
108 |
109 | class PublishError(CustomException):
110 | """
111 |     Raised when a publish operation fails.
112 | """
113 |
114 | status_code = InternalServerError.code
115 |
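For context, these exceptions are turned into JSON responses by app_error_handler in webs/api/utils/responses.py (not shown in this section); a minimal handler for this hierarchy could look roughly like the sketch below.

from flask import jsonify

from webs.api.exceptions.customs import CustomException


def register_error_handler(app):
    @app.errorhandler(CustomException)
    def handle_custom_exception(error):
        # to_dict() yields {"status": False, "error": {"message": ..., "type": ...}}
        return jsonify(error.to_dict()), error.status_code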
--------------------------------------------------------------------------------
/services/engine/webs/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .base_model import db, redis_store
5 | from .task import Task, SubTask, ScheduleTaskRecord
6 | from .url import Url
7 | from .task_url import TaskUrl
8 | from .server import Server
9 | from .result import Result
10 | from .apscheduler_job import APSchedulerJobs
11 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/apscheduler_job.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from sqlalchemy import Column, types
5 | from sqlalchemy.dialects import postgresql
6 |
7 | from webs.api.models import db
8 |
9 | """
10 | APScheduler job store table
11 | """
12 |
13 |
14 | class APSchedulerJobs(db.Model):
15 | __tablename__ = 'apscheduler_jobs'
16 |
17 | id = Column(types.String(length=191), primary_key=True)
18 | next_run_time = Column(postgresql.DOUBLE_PRECISION(precision=53), index=True)
19 | job_state = Column(postgresql.BYTEA(), nullable=False)
20 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/base_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 | from flask_redis import FlaskRedis
5 |
6 | db = SQLAlchemy()
7 | redis_store = FlaskRedis()
8 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .apschedule import ApschedulerModelProxy
3 | from .result import ResultModelProxy
4 | from .schedule_task import ScheduleTaskProxy
5 | from .server import ServerModelProxy
6 | from .task import TaskModelProxy
7 | from .task_url import TaskUrlModelProxy
8 | from .url import UrlModelProxy
9 | from .subtask import SubTaskModelProxy
10 |
11 | task_model_proxy = TaskModelProxy()
12 | schedule_task_proxy = ScheduleTaskProxy()
13 | url_model_proxy = UrlModelProxy()
14 | task_url_model_proxy = TaskUrlModelProxy()
15 | server_model_proxy = ServerModelProxy()
16 | subtask_model_proxy = SubTaskModelProxy()
17 | result_model_proxy = ResultModelProxy()
18 | apscheduler_model_proxy = ApschedulerModelProxy()
19 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/apschedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | from datetime import datetime
4 |
5 | from webs.api.models import APSchedulerJobs
6 | from webs.api.models.db_proxy.base import BaseModelProxy
7 |
8 |
9 | class ApschedulerModelProxy(BaseModelProxy):
10 | def __init__(self):
11 | super().__init__()
12 | self.model = APSchedulerJobs
13 |
14 | def get_next_run_time(self, apschedule_id):
15 | """
16 |         Get the next scheduled run time for a job
17 | :param apschedule_id:
18 | :return:
19 | """
20 | schedule_obj = self.find(id=str(apschedule_id))
21 | if schedule_obj and schedule_obj.next_run_time:
22 | return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(schedule_obj.next_run_time))
23 | return
24 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Result
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ResultModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Result
11 |
12 | def save_model_by_grpc(self, **kwargs):
13 | """
14 |         Persist crawl data collected via rpc
15 | :param kwargs:
16 | :return:
17 | """
18 |
19 | obj = Result(
20 | subtask_id=kwargs['subtask_id'], url_id=kwargs['url_id'], url_address=kwargs['url_address'],
21 | http_code=kwargs.get('http_code'), title=kwargs.get('title'), content=kwargs.get('content'),
22 | current_url=kwargs.get('current_url'), redirect_chain=kwargs.get('redirect_chain', []),
23 | response_headers=kwargs.get('response_headers', {}), har_uuid=kwargs.get('har_uuid'),
24 | screenshot_id=kwargs.get('screenshot_id'), finished_at=kwargs['finished_at'],
25 | cookies=kwargs.get('cookies', []), wappalyzer_results=kwargs.get('wappalyzer_results', []),
26 | text=kwargs.get('text'), favicon_md5=kwargs.get('favicon_md5'), favicon_link=kwargs.get('favicon_link'),
27 | response_time=kwargs.get('response_time'), load_complete_time=kwargs.get('load_complete_time'),
28 | charset=kwargs.get('charset')
29 | )
30 | self.db_session.add(obj)
31 | self.db_session.flush()
32 | self.safe_commit()
33 | return obj
34 |
35 | def get_by_url(self, url, fields):
36 | """
37 |         Fetch the result by url
38 | :param url:
39 | :param fields:
40 | :return:
41 | """
42 |
43 | # obj = self.self_session.filter(
44 | # or_(self.model.url_address == url.rstrip('/'), self.model.url_address == url.rstrip('/') + '/')) \
45 | # .order_by(self.model.finished_at.desc()).first()
46 | from webs.api.models.db_proxy import url_model_proxy
47 | url_obj = url_model_proxy.find(address=url)
48 | if not url_obj:
49 | return {}
50 |
51 |         # Query all records for this url
52 | objs = self.self_session.filter(self.model.url_id == url_obj.id) \
53 | .order_by(self.model.id.desc()).all()
54 |
55 | latest_record = {}
56 | if objs:
57 | latest_record = objs[0].as_dict()
58 | latest_record['other_records'] = [{
59 | 'result_id': each.id,
60 | 'finished_at': each.finished_at.strftime("%Y-%m-%d %H:%M:%S")}
61 | for each in objs[1:]
62 | ]
63 |
64 | if fields: latest_record = {each: latest_record[each] for each in fields if each in latest_record}
65 |
66 | return latest_record
67 |
68 | def get_by_result_id(self, result_id):
69 | """
70 |         Fetch the result by id
71 | :param result_id:
72 | :return:
73 | """
74 |
75 | obj = self.self_session.filter(self.model.id == result_id).order_by(self.model.finished_at.desc()).first()
76 | return {} if not obj else obj.as_dict()
77 |
78 | def get_favicon_data_by_url(self, url):
79 | """
80 |         Get existing favicon info for a url
81 | :param url:
82 | :return:
83 | """
84 |
85 | obj = self.db_session.query(self.model.favicon_md5, self.model.favicon_link) \
86 | .filter(self.model.url_address == url).order_by(self.model.create_time.desc()).first()
87 | return (None, None) if not obj else (obj[0], obj[1])
88 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/schedule_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import ScheduleTaskRecord, SubTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ScheduleTaskProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = ScheduleTaskRecord
11 |
12 | def query_schedule_task_obj_by_subtask_id(self, subtask_id):
13 | """
14 |         Query the schedule task that a subtask belongs to
15 | :return:
16 | """
17 |
18 | return self.self_session \
19 | .join(SubTask, SubTask.schedule_task_id == self.model.id) \
20 | .filter(SubTask.id == subtask_id).first()
21 |
22 | def query_running_schedule_tasks(self, task_id):
23 | """
24 |         Query schedule tasks that are still running
25 | :param task_id:
26 | :return:
27 | """
28 | return self.self_session.filter(
29 | self.model.task_id == task_id,
30 | self.model.finished.is_(False)
31 | ).all()
32 |
33 | def query_running_task_and_task_id(self, schedule_task_id):
34 | """
35 |         Query the parent task id and its running schedule tasks
36 | :param schedule_task_id:
37 | :return:
38 | """
39 | schedule_task_obj = self.find(id=schedule_task_id)
40 | return schedule_task_obj.task_id, self.query_running_schedule_tasks(schedule_task_obj.task_id)
41 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import desc, asc
3 |
4 | from webs.api.models import Server
5 | from webs.api.models.db_proxy.base import BaseModelProxy
6 |
7 |
8 | class ServerModelProxy(BaseModelProxy):
9 | def __init__(self):
10 | super().__init__()
11 | self.model = Server
12 |
13 | def query_servers_by_score(self, sort='desc'):
14 | """
15 |         Score servers by weight and load
16 | :return:
17 | """
18 |
19 | query = self.self_session.filter(self.model.enabled.is_(True), self.model.status.is_(True)).all()
20 | results = [{
21 | 'server_id': each_obj.id,
22 | 'server_name': each_obj.server_name,
23 | 'server_address': each_obj.server_address,
24 | 'score': int((1 - float(each_obj.load)) * each_obj.weight * 10)
25 | } for each_obj in query]
26 | return sorted(results, key=lambda x: x['score'], reverse=True if sort == 'desc' else False)
27 |
28 | def add_server(self, address):
29 |         """Add a new spider server node"""
30 | obj = Server(server_name=address, server_address=address)
31 | self.db_session.add(obj)
32 | self.safe_commit()
33 | return
34 |
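A worked example of the scoring rule above, score = int((1 - load) * weight * 10), with made-up numbers:

servers = [
    {'server_name': 'node-a', 'load': '0.1', 'weight': 3},  # (1 - 0.1) * 3 * 10 = 27
    {'server_name': 'node-b', 'load': '0.5', 'weight': 5},  # (1 - 0.5) * 5 * 10 = 25
]
scored = sorted(
    ({**s, 'score': int((1 - float(s['load'])) * s['weight'] * 10)} for s in servers),
    key=lambda x: x['score'], reverse=True)
# node-a wins: a lightly loaded mid-weight node can outrank a busier high-weight one.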
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/subtask.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import SubTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class SubTaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = SubTask
11 |
12 | def create(self, schedule_task_id, server_id):
13 | """
14 |         Create a subtask
15 | :param schedule_task_id:
16 | :param server_id:
17 | :return:
18 | """
19 |
20 | obj = SubTask(schedule_task_id=schedule_task_id, server_id=server_id)
21 | self.db_session.add(obj)
22 | self.safe_commit()
23 | return obj
24 |
25 | def query_delivery_failure_count(self, schedule_task_id):
26 | """
27 |         Count subtasks whose delivery failed
28 | :return:
29 | """
30 | return self.self_session.filter(
31 | self.model.schedule_task_id == schedule_task_id,
32 | self.model.delivery_failure_msg.isnot(None)
33 | ).count()
34 |
35 | def query_unfinished_subtask_count(self, schedule_task_id):
36 | """
37 |         Count the unfinished subtasks of the given schedule task
38 | :param schedule_task_id:
39 | :return:
40 | """
41 | return self.self_session.filter(
42 | self.model.schedule_task_id == schedule_task_id, self.model.finished.is_(False)
43 | ).count()
44 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Task, SubTask, TaskUrl, Result, ScheduleTaskRecord
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class TaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Task
11 |
12 | def create(self,
13 | customer_id=None,
14 | task_name=None,
15 | extra_data=None,
16 | task_status='executing',
17 | crawl_options={},
18 | **kwargs):
19 | """
20 | :return:
21 | """
22 | obj = Task(
23 | customer_id=customer_id, task_name=task_name,
24 | task_status=task_status, crawl_options=crawl_options, extra_data=extra_data,
25 | schedule_options={'schedule_type': kwargs['schedule_type'], 'schedule_data': kwargs['schedule_data']})
26 | self.db_session.add(obj)
27 | self.db_session.flush()
28 | self.safe_commit()
29 |
30 | return obj
31 |
32 | def query_task_obj_by_subtask(self, subtask_id):
33 | """
34 |         Get the main task object via a subtask id
35 | :param subtask_id:
36 | :return:
37 | """
38 |
39 | task_obj = self.db_session.query(self.model).select_from(self.model) \
40 | .join(ScheduleTaskRecord, ScheduleTaskRecord.task_id == self.model.id) \
41 | .join(SubTask, SubTask.schedule_task_id == ScheduleTaskRecord.id) \
42 | .filter(SubTask.id == subtask_id) \
43 | .first()
44 |
45 | return task_obj
46 |
47 | def query_url_count(self, task_id):
48 | """
49 |         Get the total number of urls for a task
50 | :param task_id:
51 | :return:
52 | """
53 |
54 | return self.db_session.query(TaskUrl).filter(TaskUrl.task_id == task_id).count()
55 |
56 | def query_crawl_url_count(self, task_id):
57 | """
58 |         Get the number of urls already crawled for a task
59 | :param task_id:
60 | :return:
61 | """
62 |
63 | return self.db_session.query(Result) \
64 | .join(SubTask, Result.subtask_id == SubTask.id) \
65 | .join(ScheduleTaskRecord, ScheduleTaskRecord.id == SubTask.schedule_task_id) \
66 | .filter(ScheduleTaskRecord.task_id == task_id).count()
67 |
68 | def add_schedule_record(self, task_id, schedule_task_status, crawl_options):
69 | """
70 |         Add a schedule record
71 | :param task_id:
72 | :param schedule_task_status:
73 | :param crawl_options:
74 | :return:
75 | """
76 | obj = ScheduleTaskRecord(
77 | task_id=task_id,
78 | crawl_options=crawl_options,
79 | schedule_task_status=schedule_task_status
80 | )
81 | self.db_session.add(obj)
82 | self.safe_commit()
83 | return obj
84 |
85 | def query_task_loop_count(self, task_id):
86 | """
87 |         Get the number of rounds the task has already run
88 | :param task_id:
89 | :return:
90 | """
91 |
92 | return self.db_session.query(ScheduleTaskRecord).filter(ScheduleTaskRecord.task_id == task_id).count()
93 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/task_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import TaskUrl, Url
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class TaskUrlModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = TaskUrl
11 |
12 | def create(self, task_id, urls_id):
13 | """
14 | :return:
15 | """
16 | self.db_session.add_all(
17 | [TaskUrl(task_id=task_id, url_id=url_id) for url_id in urls_id])
18 | self.safe_commit()
19 |
20 | def create_subtask_url_mapping(self, chunk_url, subtask_id):
21 | """
22 |         Create the mapping between a subtask and its urls
23 | :param chunk_url:
24 | :param subtask_id:
25 | :return:
26 | """
27 | urls_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(chunk_url)).all()
28 | self.self_session.filter(self.model.url_id.in_([each[0] for each in urls_query])).update(
29 | {self.model.sub_task_id: subtask_id}, synchronize_session='fetch')
30 | self.safe_commit()
31 | return [{'url_id': each[0], 'url_address': each[1]} for each in urls_query]
32 |
33 | def query_urls_by_task_id(self, task_id):
34 | """
35 |         Query the urls associated with a task id
36 | :param task_id:
37 | :return:
38 | """
39 |
40 | query = self.db_session.query(self.model.url_id, Url.address) \
41 | .join(Url, Url.id == self.model.url_id) \
42 | .filter(self.model.task_id == task_id) \
43 | .all()
44 | return [{
45 | 'url_id': each_obj[0], 'url_address': each_obj[1]}
46 | for each_obj in query
47 | ]
48 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Url
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class UrlModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Url
11 |
12 | def create(self, urls):
13 | """
14 | :return:
15 | """
16 |
17 |         # Find urls that already exist in the system
18 | exist_url_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(urls)).all()
19 | exist_urls_id = [each[0] for each in exist_url_query]
20 |
21 |         # Create urls that do not yet exist in the system
22 | not_create_urls = set(urls).difference(set([each[1] for each in exist_url_query]))
23 | create_url_models = [Url(address=url) for url in not_create_urls]
24 | self.db_session.add_all(create_url_models)
25 | self.safe_commit()
26 |
27 | exist_urls_id.extend([each.id for each in create_url_models])
28 | return exist_urls_id
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Crawl result storage model
5 | """
6 |
7 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text
8 | from sqlalchemy.dialects.postgresql import JSONB
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Result(db.Model):
14 | __tablename__ = 'results'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 |     subtask_id = Column(Integer, nullable=False, index=True)  # id of the subtask this result belongs to
18 |     url_id = Column(Integer, nullable=False, index=True)  # url id
19 |     url_address = Column(String(1024), nullable=False)  # url address
20 |     http_code = Column(Integer)  # site HTTP status code
21 |     title = Column(Text)  # site title
22 |     content = Column(Text)  # site content
23 |     text = Column(Text)  # page body text
24 |     current_url = Column(String(1024))  # last url the site responded with
25 |     redirect_chain = Column(JSONB)  # redirect chain
26 |     response_headers = Column(JSONB)  # response headers
27 |     har_uuid = Column(String(128))  # site interaction trace (HAR)
28 |     screenshot_id = Column(String(128))  # screenshot id
29 |     cookies = Column(JSONB)  # cookies
30 |     finished_at = Column(TIMESTAMP)  # finish time
31 |     wappalyzer_results = Column(JSONB)  # site fingerprints
32 |     callback_failure_msg = Column(Text)  # callback error message
33 |     favicon_md5 = Column(String(50))  # favicon hash
34 |     favicon_link = Column(String(1024))  # favicon link
35 |     response_time = Column(Integer)  # site response time
36 |     load_complete_time = Column(Integer)  # page load-complete time
37 |     charset = Column(String(256))  # site charset
38 |
39 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
40 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
41 |
42 | def __repr__(self):
43 | return f''
44 |
45 | def as_dict(self):
46 | from webs.api.models.db_proxy import task_model_proxy
47 | task_obj = task_model_proxy.query_task_obj_by_subtask(self.subtask_id)
48 |
49 | return {
50 | 'result_id': self.id,
51 | 'subtask_id': self.subtask_id,
52 | 'task_id': task_obj.id if task_obj else None,
53 | 'customer_id': task_obj.customer_id if task_obj else None,
54 | 'url_id': self.url_id,
55 | 'url_address': self.url_address,
56 | 'http_code': self.http_code,
57 | 'title': self.title,
58 | 'content': self.content,
59 | 'text': self.text,
60 | 'current_url': self.current_url,
61 | 'redirect_chain': self.redirect_chain,
62 | 'response_headers': self.response_headers,
63 | 'har_uuid': self.har_uuid,
64 | 'screenshot_id': self.screenshot_id,
65 | 'cookies': self.cookies,
66 | 'favicon_md5': self.favicon_md5,
67 | 'favicon_link': self.favicon_link,
68 | 'wappalyzer_results': self.wappalyzer_results,
69 | 'response_time': self.response_time,
70 | 'load_complete_time': self.load_complete_time,
71 | 'charset': self.charset,
72 | 'finished_at': self.finished_at.strftime("%Y-%m-%d %H:%M:%S")
73 | }
74 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Spider node model
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Boolean, Integer, Float
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Server(db.Model):
14 | __tablename__ = 'servers'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 |     server_name = Column(String(128))  # spider node server name
18 |     server_address = Column(String(255), unique=True, nullable=True)  # server address, ip:port
19 |     enabled = Column(Boolean, server_default='t')  # whether enabled, defaults to true
20 |     status = Column(Boolean, server_default='t')  # whether the server is healthy, defaults to true
21 |     weight = Column(Integer, server_default='3')  # server weight 1-5, defaults to 3
22 |     # load = Column(Integer, server_default='0')  # server load, reported periodically by child nodes
23 |     load = Column(String(20), server_default='0.1')  # server load, reported periodically to the master node
24 |     spider_type = Column(String(20), server_default='splash')  # spider node type: splash/pyppeteer
25 |
26 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
27 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
28 |
29 | def __repr__(self):
30 | return f''
31 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Dynamic crawler scan task models
6 | """
7 | from datetime import datetime
8 |
9 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY, Boolean, Text
10 | from sqlalchemy.dialects.postgresql import JSONB
11 |
12 | from webs.api.models import db
13 |
14 |
15 | class Task(db.Model):
16 | __tablename__ = 'tasks'
17 |
18 | id = Column(BigInteger, primary_key=True, autoincrement=True)
19 |     customer_id = Column(String(255), index=True)  # opaque caller identifier; echoed back unchanged in API responses so callers can match requests to results
20 |     task_name = Column(String(255))  # task name
21 |     task_status = Column(String(255))  # task status
22 |     finished = Column(Boolean, server_default='f')  # whether the task has finished
23 |     schedule_options = Column(JSONB)  # periodic scheduling parameters
24 |     crawl_options = Column(JSONB)  # crawl parameters
25 |     extra_data = Column(Text)  # extra client-provided data
26 |
27 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
28 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
29 |
30 | def __repr__(self):
31 | return f''
32 |
33 | def as_dict(self, **kwargs):
34 | return {
35 | 'task_id': self.id,
36 | 'customer_id': self.customer_id,
37 | 'task_name': self.task_name,
38 | 'task_status': self.task_status,
39 | 'finished': self.finished,
40 | 'crawl_options': self.crawl_options,
41 | 'schedule_options': self.schedule_options,
42 | 'extra_data': self.extra_data,
43 | 'create_time': self.create_time.strftime("%Y-%m-%d %H:%M:%S"),
44 | 'update_time': self.update_time.strftime("%Y-%m-%d %H:%M:%S")
45 | } if not kwargs.get('fields') else {
46 | f: getattr(self, f, None) if not isinstance(getattr(self, f), datetime)
47 | else getattr(self, f).strftime("%Y-%m-%d %H:%M:%S")
48 | for f in kwargs['fields'] if f in self.__table__.columns
49 | }
50 |
51 |
52 | class SubTask(db.Model):
53 | __tablename__ = 'sub_tasks'
54 |
55 | id = Column(BigInteger, primary_key=True, autoincrement=True)
56 |     schedule_task_id = Column(Integer, nullable=False, index=True)  # id of the schedule-task run this subtask belongs to
57 |     server_id = Column(Integer, nullable=False)  # server node assigned to this subtask
58 |     assigned_urls = Column(ARRAY(String))  # urls assigned to this subtask
59 |     delivery_failure_msg = Column(Text)  # delivery failure reason
60 |     finished = Column(Boolean, server_default='f')  # whether finished
61 |     finished_at = Column(TIMESTAMP)  # finish time
62 |
63 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
64 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
65 |
66 | def __repr__(self):
67 | return f''
68 |
69 |
70 | class ScheduleTaskRecord(db.Model):
71 | __tablename__ = 'schedule_task_records'
72 |
73 | id = Column(BigInteger, primary_key=True, autoincrement=True)
74 |     task_id = Column(Integer, nullable=False, index=True)  # parent task id
75 |     schedule_task_status = Column(String(255))  # schedule-task status
76 |     finished = Column(Boolean, server_default='f')  # whether this run has finished
77 |     crawl_options = Column(JSONB)  # crawl parameters used for this run
78 |
79 |     create_time = Column(TIMESTAMP, server_default=func.now(), index=True)  # schedule-task creation time
80 |     start_time = Column(TIMESTAMP)  # when this run actually started
81 |     finished_time = Column(TIMESTAMP)  # when this run finished
82 |
83 | def __repr__(self):
84 | return f''
85 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/task_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Task-to-url mapping
5 | """
6 |
7 | from sqlalchemy import Column, BigInteger, TIMESTAMP, func
8 |
9 | from webs.api.models import db
10 |
11 |
12 | class TaskUrl(db.Model):
13 | __tablename__ = 'task_url'
14 |
15 | id = Column(BigInteger, primary_key=True, autoincrement=True)
16 | task_id = Column(BigInteger, nullable=True, index=True)
17 |     # sub_task_id = Column(BigInteger, index=True)  # subtask id
18 | url_id = Column(BigInteger, nullable=True, index=True)
19 |
20 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
21 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
22 |
23 | def __repr__(self):
24 | return f''
25 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Url model
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Url(db.Model):
14 | __tablename__ = 'urls'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 | address = Column(String(1024), unique=True, index=True)
18 |
19 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
20 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
21 |
22 | def __repr__(self):
23 | return f''
24 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | from datetime import datetime
4 |
5 | from webs.api.models.db_proxy import task_model_proxy
6 | from webs.api.exceptions.customs import InvalidAPIRequest, RecordNotFound
7 | from webs.api.utils.helper import nowstr, today
8 |
9 |
10 | class LengthChecker(object):
11 |     """Field length validator"""
12 |
13 | def __init__(self, sign, length):
14 | self.sign = sign
15 | self.length = length
16 |
17 | def __call__(self, verified):
18 | if verified is not None and len(verified) > self.length:
19 | raise InvalidAPIRequest(f'{self.sign}长度过长!')
20 |
21 |
22 | class OneOf(object):
23 | """Validator which succeeds if ``value`` is a member of ``choices``"""
24 |
25 | def __init__(self, choices):
26 | self.choices = choices
27 |
28 | def __call__(self, verified):
29 | if verified not in self.choices:
30 | raise InvalidAPIRequest(f'请选择{self.choices}其中之一!')
31 |
32 |
33 | class TaskValidator(object):
34 |     """Main task validator"""
35 |
36 | def __init__(self):
37 | self.url_pattern = r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
38 |
39 | def url_valid(self, url):
40 | if not re.match(self.url_pattern, url):
41 | raise InvalidAPIRequest(f'{url}不是一个合法的Url!')
42 |
43 | def task_id_exist_valid(self, task_id):
44 | if not task_model_proxy.find(id=task_id):
45 | raise RecordNotFound('该任务不存在!')
46 |
47 | @staticmethod
48 | def schedule_valid(kwargs):
49 |         """Validate periodic scheduling parameters"""
50 | schedule_type, schedule_data = kwargs.get('schedule_type'), kwargs.get('schedule_data')
51 |
52 |         # Validate one-off (datetime) scheduling
53 | if schedule_type == 'datetime':
54 | if len(schedule_data) > 1:
55 | raise InvalidAPIRequest('无效的执行时间!')
56 | run_date = schedule_data.get('run_date')
57 | if not run_date:
58 | raise InvalidAPIRequest('无效的执行时间!')
59 |             # Compare with the current time
60 | if run_date <= nowstr():
61 | raise InvalidAPIRequest('执行时间不能小于当前时间!')
62 |
63 |         # Validate interval and cron scheduling
64 | elif schedule_type in ('interval', 'cron'):
65 | if not schedule_data.get('start_date') or not schedule_data.get('end_date'):
66 | raise InvalidAPIRequest('请输入开始时间和结束时间!')
67 |
68 | interval_effective_params = {
69 | 'weeks', 'days', 'hours', 'minutes', 'seconds',
70 | 'start_date', 'end_date', 'max_instances'
71 | }
72 | cron_effective_params = {
73 | 'week', 'day', 'hour', 'minute',
74 | 'second', 'year', 'month',
75 | 'day_of_week', 'max_instances',
76 | 'start_date', 'end_date'
77 | }
78 |
79 | if (schedule_type == 'cron' and set(schedule_data.keys()).difference(cron_effective_params)) or (
80 | schedule_type == 'interval' and set(schedule_data.keys()).difference(interval_effective_params)):
81 | raise InvalidAPIRequest('无效的调度参数!')
82 |
83 | if not set(schedule_data.keys()).difference({'start_date', 'end_date'}):
84 | raise InvalidAPIRequest('请输入正确的调度参数!')
85 |
86 | if schedule_data.get('start_date') >= schedule_data.get('end_date'):
87 | raise InvalidAPIRequest('开始时间不能大于结束时间!')
88 |
89 | if schedule_data.get('end_date') < today():
90 | raise InvalidAPIRequest('结束时间不能小于当前时间!')
91 |
92 |
93 | class TimeValidator(object):
94 |
95 | def __init__(self, s=None, e=None):
96 | self.s = s
97 | self.e = e
98 |
99 | @staticmethod
100 | def date_or_datetime_valid(_time):
101 | try:
102 | datetime.strptime(_time, "%Y-%m-%d")
103 | return
104 | except (ValueError, AttributeError) as e:
105 | pass
106 | try:
107 | datetime.strptime(_time, "%Y-%m-%d %H:%M:%S")
108 | return
109 | except (ValueError, AttributeError) as e:
110 | pass
111 | raise InvalidAPIRequest('请输入正确的日期时间!')
112 |
113 | def __call__(self, time_field):
114 | if not self.s <= time_field <= self.e:
115 | raise InvalidAPIRequest('请输入正确的时间范围!')
116 |
117 |
118 | task_validator = TaskValidator()
119 | time_validator = TimeValidator()
120 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from webargs import fields
3 |
4 | result_by_url_schema = {
5 | "url": fields.Url(required=True),
6 | "fields": fields.DelimitedList(fields.Str(), missing=[])
7 | }
8 |
9 | result_by_id_schema = {
10 | "result_id": fields.Int(required=True)
11 | }
12 |
13 | get_screenshot_schema = {
14 | 'screenshot_id': fields.Str(required=True)
15 | }
16 |
17 | download_har_file_schema = {
18 | 'har_uuid': fields.Str(required=True)
19 | }
20 |
21 | get_favicon_schema = {
22 | 'favicon_md5': fields.Str(required=True)
23 | }
24 |
25 | get_small_schema = {
26 | **get_screenshot_schema,
27 | 'wide': fields.Int(missing=272),
28 | 'high': fields.Int(missing=165)
29 | }
30 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 |
4 | from webargs import fields
5 |
6 | from webs.api.schemas import LengthChecker, OneOf, task_validator, TimeValidator as TimeRangeValid, time_validator
7 |
8 | schedule_data = {
9 |     'run_date': fields.Str(validate=time_validator.date_or_datetime_valid),  # one-off execution time
10 | 'year': fields.Int(validate=TimeRangeValid(2021, 2999)),
11 | 'month': fields.Int(validate=TimeRangeValid(1, 12)),
12 | 'day': fields.Int(validate=TimeRangeValid(1, 31)),
13 | 'week': fields.Int(validate=TimeRangeValid(1, 53)),
14 | 'day_of_week': fields.Int(validate=TimeRangeValid(0, 6)),
15 | 'hour': fields.Int(validate=TimeRangeValid(0, 23)),
16 | 'minute': fields.Int(validate=TimeRangeValid(0, 59)),
17 | 'second': fields.Int(validate=TimeRangeValid(0, 59)),
18 | 'weeks': fields.Int(),
19 | 'days': fields.Int(),
20 | 'hours': fields.Int(),
21 | 'minutes': fields.Int(),
22 | 'seconds': fields.Int(),
23 | 'start_date': fields.Str(validate=time_validator.date_or_datetime_valid),
24 | 'end_date': fields.Str(validate=time_validator.date_or_datetime_valid),
25 | 'max_instances': fields.Int(missing=1)
26 | }
27 |
28 | crawl_options = {
29 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])),
30 |     'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])),  # task priority
31 |     'headless': fields.Bool(missing=False),  # headless/headed mode, defaults to headed
32 |     'debug': fields.Bool(missing=False),  # enable debug mode
33 |     'referer': fields.Str(),  # referer address
34 |     'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])),  # concurrency
35 |     'url_timeout': fields.Int(missing=30),  # per-url timeout
36 |     'enabled_render_js': fields.Bool(missing=True),
37 |     'page_wait_time': fields.Int(missing=3),  # time to wait for page js rendering
38 |     'ignore_ssl': fields.Bool(missing=True),  # ignore certificate errors
39 |     'screenshot': fields.Bool(missing=False),  # take a screenshot
40 |     'proxy_url': fields.Str(),  # proxy
41 |     'user_agent': fields.Str(),  # user agent
42 |     'record_har': fields.Bool(missing=False),  # record network requests (HAR)
43 |     'record_redirect': fields.Bool(missing=False),  # record the redirect chain
44 |     'use_browser_cache': fields.Bool(missing=True),  # use the browser cache
45 |     'use_result_cache': fields.Bool(missing=True),  # use the result cache
46 |     'wappalyzer': fields.Bool(missing=False),  # run fingerprint detection
47 |     'extract_text': fields.Bool(missing=True),  # extract page body text
48 |     'extract_favicon': fields.Bool(missing=False),  # download the site favicon
49 |     'callback_type': fields.Str(validate=OneOf(choices=['http', 'rpc'])),
50 |     'callback_address': fields.Str(),
51 |     'wait_until': fields.Str(
52 |         missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])),  # controls when the page is considered loaded
53 | 'rpc_server': fields.Str(missing=os.getenv('LOCAL_RPC_SERVER_ADDRESS'))
54 | }
55 |
56 | create_task_schema = {
57 | 'customer_id': fields.Str(validate=LengthChecker(sign='自定义id', length=255)),
58 | 'task_name': fields.Str(validate=LengthChecker(sign='任务名称', length=255)),
59 | 'urls': fields.DelimitedList(fields.Str(validate=task_validator.url_valid), required=True),
60 | 'schedule_type': fields.Str(missing='instantly', validate=OneOf(['instantly', 'datetime', 'interval', 'cron'])),
61 | 'schedule_data': fields.Nested(schedule_data, missing={}),
62 | 'crawl_options': fields.Nested(crawl_options, missing={}),
63 | 'extra_data': fields.Str(),
64 | }
65 |
66 | task_id_schema = {
67 | 'task_id': fields.Int(required=True, validate=task_validator.task_id_exist_valid)
68 | }
69 |
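These schemas are consumed by the views in webs/api/views/tasks.py (not shown in this section); with webargs the binding could look roughly like the sketch below. The decorator usage is an assumption about how the views are written, not a copy of them.

from webargs.flaskparser import use_args

from webs.api.schemas.tasks import create_task_schema


@use_args(create_task_schema)
def create_task(args):
    # Defaults from crawl_options are filled in automatically, e.g.
    # args['crawl_options']['browser_type'] == 'firefox' and concurrency == 5.
    ...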
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 |
5 |
6 | def now():
7 | return datetime.now()
8 |
9 |
10 | def nowstr():
11 | return now().strftime('%Y-%m-%d %H:%M:%S')
12 |
13 |
14 | def today():
15 | return now().strftime('%Y-%m-%d')
16 |
17 |
18 | def add_spider_server(address):
19 |     """Add a spider service address"""
20 | from webs.api.models.db_proxy import server_model_proxy
21 | server_model_proxy.add_server(address)
22 |
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/loggers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import socket
6 | import sys
7 | import traceback
8 | from datetime import datetime
9 |
10 | try:
11 | import simplejson as json
12 | except ImportError:
13 | import json
14 |
15 |
16 | class JSONFormatter(logging.Formatter):
17 | """
18 | JSON formatter for python logging
19 |
20 | You can pass additional tags on a per message basis using the
21 | key "tags" in the extra parameter.
22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]})
23 | """
24 |
25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON',
26 | indent=None):
27 | """
28 | :param tags: a list of tags to add to every messages
29 | :hostname: force a specific hostname
30 | :fqdn: a boolean to use the FQDN instead of the machine's hostname
31 | :message_type: the message type for Logstash formatters
32 | :indent: indent level of the JSON output
33 | """
34 | self.message_type = message_type
35 | self.tags = tags if tags is not None else []
36 | self.extra_tags = []
37 | self.indent = indent
38 |
39 | if hostname:
40 | self.host = hostname
41 | elif fqdn:
42 | self.host = socket.getfqdn()
43 | else:
44 | self.host = socket.gethostname()
45 |
46 | def get_extra_fields(self, record):
47 | # The list contains all the attributes listed in
48 | # http://docs.python.org/library/logging.html#logrecord-attributes
49 | skip_list = [
50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args',
51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg',
52 | 'msecs', 'msecs', 'message', 'name', 'pathname', 'process',
53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra']
54 |
55 | if sys.version_info < (3, 0):
56 | easy_types = (str, bool, dict, float, int, list, type(None))
57 | else:
58 | easy_types = (str, bool, dict, float, int, list, type(None))
59 |
60 | fields = {}
61 |
62 | self.extra_tags = []
63 | for key, value in record.__dict__.items():
64 | if key not in skip_list:
65 | if key == 'tags' and isinstance(value, list):
66 | self.extra_tags = value
67 | elif isinstance(value, easy_types):
68 | fields[key] = value if value else "null"
69 | else:
70 | fields[key] = repr(value)
71 |
72 | return fields
73 |
74 | def get_debug_fields(self, record):
75 | if record.exc_info:
76 | exc_info = self.format_exception(record.exc_info)
77 | else:
78 | exc_info = record.exc_text
79 | return {
80 | 'exc_info': exc_info,
81 | 'filename': record.filename,
82 | 'lineno': record.lineno,
83 | }
84 |
85 | @classmethod
86 | def format_source(cls, message_type, host, path):
87 | return "%s://%s/%s" % (message_type, host, path)
88 |
89 | @classmethod
90 | def format_timestamp(cls, time):
91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X"))
92 |
93 | @classmethod
94 | def format_exception(cls, exc_info):
95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else ''
96 |
97 | @classmethod
98 | def serialize(cls, message, indent=None):
99 | return json.dumps(message, ensure_ascii=False, indent=indent)
100 |
101 | def format(self, record, serialize=True):
102 | old_message = record.getMessage()
103 | try:
104 | new_message = json.loads(old_message)
105 | except json.decoder.JSONDecodeError as e:
106 | message = old_message.replace("'", '"')
107 | new_message = json.loads(message)
108 | except Exception:
109 | new_message = record.getMessage()
110 | # Create message dict
111 | message = {
112 | 'timestamp': self.format_timestamp(record.created),
113 | 'app': os.environ.get('APP_NAME'),
114 | 'host': self.host,
115 | 'environment': os.environ.get('FLASK_ENV'),
116 | 'logger': record.name,
117 | 'level': record.levelname,
118 | 'messages': new_message,
119 | 'path': record.pathname,
120 | 'tags': self.tags[:]
121 | }
122 |
123 | # Add extra fields
124 | message.update(self.get_extra_fields(record))
125 |
126 | # Add extra tags
127 | if self.extra_tags:
128 | message['tags'].extend(self.extra_tags)
129 |
130 | # If exception, add debug info
131 | if record.exc_info or record.exc_text:
132 | message.update(self.get_debug_fields(record))
133 |
134 | if serialize:
135 | return self.serialize(message, indent=self.indent)
136 | return message
137 |
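For reference, a minimal sketch of attaching this formatter to a plain Python logger (the spider's gunicorn_logging.ini later in this listing wires the same class via `class = webs.api.utils.loggers.JSONFormatter`); the logger name and tags below are illustrative only:

# A minimal sketch, assuming APP_NAME/FLASK_ENV are set in the environment.
import json
import logging

from webs.api.utils.loggers import JSONFormatter

handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter(tags=['crawloop']))

logger = logging.getLogger('engine')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# The formatter expects JSON-ish messages; per-message tags go through `extra`.
logger.info(json.dumps({"event": "task_created", "task_id": 1}),
            extra={"tags": ["component=engine"]})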
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/requests.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 | from flask import current_app, request
5 | from sqlalchemy.exc import DatabaseError
6 | from webs.api.exceptions.customs import InvalidContentType
7 | from webs.api.models import db
8 |
9 | ACL_ORIGIN = 'Access-Control-Allow-Origin'
10 | ACL_METHODS = 'Access-Control-Allow-Methods'
11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers'
12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials'
13 | ACL_CACHE_CONTROL = 'Cache-Control'
14 |
15 | GET_METHOD = 'GET'
16 | OPTIONS_METHOD = 'OPTIONS'
17 | ALLOWED_ORIGINS = '*'
18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \
20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type'
21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie
22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate'
23 |
24 |
25 | def before_request_middleware(app):
26 | app.before_request_funcs.setdefault(None, [
27 | ensure_request_log,
28 | ensure_content_type,
29 | ])
30 |
31 |
32 | def after_request_middleware(app):
33 | app.after_request_funcs.setdefault(None, [
34 | enable_cors,
35 | commit_session,
36 | ])
37 |
38 |
39 | def teardown_appcontext_middleware(app):
40 | app.teardown_appcontext_funcs = [
41 | shutdown_session,
42 | ]
43 |
44 |
45 | def ensure_request_log():
46 | """当为生产环境时,屏蔽中间件日志记录器"""
47 | if current_app.debug:
48 | current_app.logger.info(
49 | "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || "
50 | "Parameters: {param}".format(
51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr),
53 | path=request.full_path,
54 | param=request.data.decode('utf-8')))
55 |
56 |
57 | def ensure_content_type():
58 | """
59 |     Ensures that the Content-Type for all non-GET/OPTIONS requests
60 |     is `application/json` or `multipart/form-data`; otherwise an appropriate error
61 |     is raised.
62 |     :raises: InvalidContentType if Content-Type is neither `application/json`
63 |     nor `multipart/form-data`
64 | """
65 |
66 | content_type = request.headers.get('Content-Type')
67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \
68 | (not content_type or not ('application/json' in content_type or
69 | 'multipart/form-data' in content_type)):
70 | raise InvalidContentType(
71 | message='Invalid Content-Type. '
72 | 'Only `application/json` or `multipart/form-data` is allowed')
73 |
74 |
75 | def enable_cors(response):
76 | """
77 | Enable Cross-origin resource sharing.
78 | These headers are needed for the clients that
79 | will consume the API via AJAX requests.
80 | """
81 | if request.method == OPTIONS_METHOD:
82 | response = current_app.make_default_options_response()
83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS
84 | response.headers[ACL_METHODS] = ALLOWED_METHODS
85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS
86 |         response.headers[ACL_CACHE_CONTROL] = ALLOWED_CACHE_CONTROL  # set the cache policy value, not the header name
87 |
88 | return response
89 |
90 |
91 | def commit_session(response):
92 | """
93 | Try to commit the db session in the case
94 | of a successful request with status_code
95 | under 400.
96 | """
97 | if response.status_code >= 400:
98 | return response
99 | try:
100 | db.session.commit()
101 | except DatabaseError:
102 | db.session.rollback()
103 | return response
104 |
105 |
106 | def shutdown_session(exception=None):
107 | """
108 | Remove the db session and detach from the
109 | database driver after application shutdown.
110 | """
111 | db.session.remove()
112 |
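These three registration helpers are meant to be called from the Flask application factory. The real create_app lives in webs/__init__.py and is not reproduced here; the sketch below only shows where the hooks plug in, under that assumption:

# Illustrative sketch only: how the request/response hooks are typically registered.
from flask import Flask

from webs.api.utils.requests import (after_request_middleware,
                                     before_request_middleware,
                                     teardown_appcontext_middleware)


def create_app_sketch():
    app = Flask(__name__)
    before_request_middleware(app)        # request logging + Content-Type check
    after_request_middleware(app)         # CORS headers + session commit
    teardown_appcontext_middleware(app)   # remove the db session on teardown
    return app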
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/routers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pkgutil
4 |
5 |
6 | def register_routes(app):
7 | """Register routes."""
8 | from .. import views
9 | from flask.blueprints import Blueprint
10 |
11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."):
12 | blueprint_name = name.split('.')[-1]
13 | modules = __import__(name, fromlist="dummy")
14 | blueprint = getattr(modules, blueprint_name)
15 | if isinstance(blueprint, Blueprint):
16 | app.register_blueprint(blueprint)
17 |
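register_routes only picks up a view module if it exposes a Blueprint bound to a variable named after the module itself (it does `getattr(module, module_name)`). A minimal conforming view module, with a hypothetical module name and route:

# Hypothetical webs/api/views/health.py -- the Blueprint variable must share the module name ("health").
from flask import Blueprint, jsonify

health = Blueprint('health', __name__, url_prefix='/health')


@health.route('')
def health_check():
    return jsonify({'status': True}), 200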
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_migrate import Migrate
4 |
5 | from webs.api.models import db, redis_store
6 |
7 |
8 | def init_db(app):
9 | """
10 | Create database if doesn't exist and
11 | create all tables.
12 | """
13 |
14 |     # Initialize PostgreSQL
15 |     db.init_app(app)
16 |     migrate = Migrate(compare_type=True, compare_server_default=True)
17 |     migrate.init_app(app, db)
18 |
19 |     # Initialize Redis
20 | redis_store.init_app(app)
21 |
22 | return db
23 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/ping.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 |
5 | ping = Blueprint('ping', __name__)
6 |
7 |
8 | @ping.route('/ping')
9 | def ping_pong():
10 | """
11 |     Health check: verify the service is available.
12 | """
13 | return jsonify({
14 | "data": "pong",
15 | "status": True
16 | })
17 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.result import ResultBiz
7 | from webs.api.schemas.results import result_by_url_schema, result_by_id_schema, get_screenshot_schema, \
8 | download_har_file_schema, get_favicon_schema, get_small_schema
9 |
10 | results = Blueprint('results', __name__, url_prefix='/results')
11 |
12 |
13 | @results.route('/get-by-url')
14 | @use_args(result_by_url_schema, locations=('query',))
15 | def result_by_url(args):
16 | """
17 |     Query a result by URL.
18 | :param args:
19 | :return:
20 | """
21 |
22 | result_biz = ResultBiz()
23 | return jsonify({
24 | 'status': True,
25 | 'data': result_biz.result_by_url(args['url'], args['fields'])
26 | }), 200
27 |
28 |
29 | @results.route('/get-by-id')
30 | @use_args(result_by_id_schema, locations=('query',))
31 | def result_by_id(args):
32 | """
33 |     Query a result by ID.
34 | :param args:
35 | :return:
36 | """
37 |
38 | result_biz = ResultBiz()
39 | return jsonify({
40 | 'status': True,
41 | 'data': result_biz.result_by_id(args['result_id'])
42 | }), 200
43 |
44 |
45 | @results.route('/screenshot')
46 | @use_args(get_screenshot_schema, locations=('query',))
47 | def get_screenshot(args):
48 | """
49 |     Fetch a screenshot image.
50 | :param args:
51 | :return:
52 | """
53 |
54 | result_biz = ResultBiz()
55 | return result_biz.get_screenshot(args['screenshot_id'])
56 |
57 |
58 | @results.route('/screenshot/encode')
59 | @use_args(get_screenshot_schema, locations=('query',))
60 | def get_screenshot_base64(args):
61 | """
62 |     Fetch a screenshot as a base64-encoded string.
63 | :param args:
64 | :return:
65 | """
66 |
67 | result_biz = ResultBiz()
68 | return jsonify({
69 | 'status': True,
70 | 'data': result_biz.get_screenshot_base64_encode(args['screenshot_id'])
71 | }), 200
72 |
73 |
74 | @results.route('/screenshot/download')
75 | @use_args(get_screenshot_schema, locations=('query',))
76 | def download_screenshot(args):
77 | """
78 |     Download a screenshot image.
79 | :param args:
80 | :return:
81 | """
82 | result_biz = ResultBiz()
83 | return result_biz.get_screenshot(args['screenshot_id'], download=True)
84 |
85 |
86 | @results.route('/screenshot/small')
87 | @use_args(get_small_schema, locations=('query',))
88 | def small_screenshot(args):
89 | """
90 |     View a screenshot thumbnail.
91 | :param args:
92 | :return:
93 | """
94 | result_biz = ResultBiz()
95 | return result_biz.get_small_screenshot(**args)
96 |
97 |
98 | @results.route('/har/download')
99 | @use_args(download_har_file_schema, locations=('query',))
100 | def download_har_file(args):
101 | """
102 |     Download a HAR file.
103 | :param args:
104 | :return:
105 | """
106 |
107 | result_biz = ResultBiz()
108 | return result_biz.download_har(args['har_uuid'])
109 |
110 |
111 | @results.route('/favicon')
112 | @use_args(get_favicon_schema, locations=('query',))
113 | def get_favicon(args):
114 | """
115 |     View a website favicon.
116 | :param args:
117 | :return:
118 | """
119 | result_biz = ResultBiz()
120 | return result_biz.get_favicon(args['favicon_md5'])
121 |
122 |
123 | @results.route('/favicon/download')
124 | @use_args(get_favicon_schema, locations=('query',))
125 | def download_favicon(args):
126 | """
127 |     Download a website favicon.
128 | :param args:
129 | :return:
130 | """
131 | result_biz = ResultBiz()
132 | return result_biz.get_favicon(args['favicon_md5'], download=True)
133 |
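A hypothetical client call against these endpoints; the host/port and the comma-delimited form of `fields` are assumptions (the results schema itself is defined elsewhere), not guarantees from the code above:

# Illustrative query of /results/get-by-url; values are placeholders.
import requests

resp = requests.get(
    'http://localhost:5000/results/get-by-url',
    params={'url': 'https://example.com', 'fields': 'title,http_code'},
)
print(resp.json())   # expected shape: {"status": true, "data": ...}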
--------------------------------------------------------------------------------
/services/engine/webs/api/views/tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.task import TaskBiz
7 | from webs.api.schemas import task_validator
8 | from webs.api.schemas.tasks import create_task_schema, task_id_schema
9 |
10 | tasks = Blueprint('tasks', __name__, url_prefix='/tasks')
11 |
12 |
13 | @tasks.route('', methods=['POST'])
14 | @use_args(create_task_schema, locations=('json',), validate=task_validator.schedule_valid)
15 | def create_task(args):
16 | """
17 |     Create a crawl task.
18 | :param args:
19 | :return:
20 | """
21 |
22 | task_biz = TaskBiz()
23 | data = task_biz.create_task(**args)
24 |
25 | return jsonify({
26 | 'status': True,
27 | 'data': data
28 | }), 201
29 |
30 |
31 | @tasks.route('', methods=['DELETE'])
32 | @use_args(task_id_schema, locations=('json',))
33 | def delete_task(args):
34 | """
35 |     Delete a crawl task.
36 | :param args:
37 | :return:
38 | """
39 |
40 | task_biz = TaskBiz()
41 | task_biz.delete_task(args['task_id'])
42 |
43 | return jsonify({
44 | 'status': True
45 | }), 204
46 |
47 |
48 | @tasks.route('/status')
49 | @use_args(task_id_schema, locations=('query',))
50 | def get_task_status(args):
51 | """
52 |     Query task progress.
53 | :param args:
54 | :return:
55 | """
56 |
57 | task_biz = TaskBiz()
58 | return jsonify({
59 | 'status': True,
60 | 'data': task_biz.get_task_status(**args)
61 | }), 200
62 |
63 |
64 | @tasks.route('/pause', methods=['PATCH'])
65 | @use_args(task_id_schema, locations=('json',))
66 | def pause_task(args):
67 | """
68 |     Pause a scheduled task.
69 | :param args:
70 | :return:
71 | """
72 |
73 | task_biz = TaskBiz()
74 | task_biz.pause_task(args['task_id'])
75 |
76 | return jsonify({
77 | 'status': True,
78 | }), 200
79 |
80 |
81 | @tasks.route('/resume', methods=['PATCH'])
82 | @use_args(task_id_schema, locations=('json',))
83 | def resume_task(args):
84 | """
85 |     Resume a scheduled task.
86 | :param args:
87 | :return:
88 | """
89 |
90 | task_biz = TaskBiz()
91 | task_biz.resume_task(args['task_id'])
92 |
93 | return jsonify({
94 | 'status': True,
95 | }), 200
96 |
97 |
98 | @tasks.route('/redelivery', methods=['POST'])
99 | @use_args(task_id_schema, locations=('json',))
100 | def redelivery(args):
101 | """
102 |     Re-deliver the task to the spider nodes.
103 | """
104 |
105 | task_biz = TaskBiz()
106 | task_biz.redelivery(args['task_id'])
107 | return jsonify({'status': True}), 200
108 |
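A hypothetical payload for POST /tasks, matching the tail of create_task_schema shown at the top of this listing (urls, schedule_type, schedule_data, crawl_options, extra_data); the host/port and the concrete values are placeholders:

# Illustrative task creation request; crawl_options is omitted here to avoid guessing its keys.
import requests

payload = {
    "urls": ["https://example.com", "https://example.org"],
    "schedule_type": "instantly",   # one of: instantly, datetime, interval, cron
    "schedule_data": {},
    "extra_data": "client-side-correlation-id",
}
resp = requests.post('http://localhost:5000/tasks', json=payload)
print(resp.status_code, resp.json())   # 201 with {"status": true, "data": ...} on success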
--------------------------------------------------------------------------------
/services/engine/webs/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import os
5 |
6 |
7 | class BaseConfig:
8 | """Base configuration"""
9 |
10 | # Root path of project
11 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12 |
13 | DEBUG = True
14 | TESTING = False
15 | SQLALCHEMY_TRACK_MODIFICATIONS = False
16 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True}
17 | SECRET_KEY = os.environ.get('SECRET_KEY')
18 |
19 | # Redis configuration
20 | REDIS_URL = os.environ.get('REDIS_URL')
21 |
22 |
23 | class DevelopmentConfig(BaseConfig):
24 | """Development configuration"""
25 |
26 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
27 |
28 |
29 | class ProductionConfig(BaseConfig):
30 | """Production configuration"""
31 |
32 | DEBUG = False
33 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
34 |
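All configuration values are read from environment variables at import time, so they must be set before webs.config is imported. A small sketch with placeholder values (the real deployment provides these through the container environment):

# Placeholder values for the variables this config reads; illustrative only.
import os

os.environ.setdefault('SECRET_KEY', 'change-me')
os.environ.setdefault('DATABASE_URL', 'postgresql://user:password@postgres:5432/crawloop')
os.environ.setdefault('REDIS_URL', 'redis://redis:6379/0')

from webs.config import DevelopmentConfig

print(DevelopmentConfig.SQLALCHEMY_DATABASE_URI)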
--------------------------------------------------------------------------------
/services/engine/webs/core/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/core/requests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .request import web_client
4 |
--------------------------------------------------------------------------------
/services/engine/webs/core/requests/request.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import requests
5 |
6 | from webs.api.exceptions.customs import ServerError, InvalidAPIRequest, RecordNotFound, RecordAlreadyExists
7 |
8 |
9 | class RequestMixin(object):
10 | CODE_EXCEPTION_MSG = {
11 | 400: InvalidAPIRequest,
12 | 404: RecordNotFound,
13 | 409: RecordAlreadyExists,
14 | 422: InvalidAPIRequest,
15 | 500: ServerError,
16 | }
17 |
18 | def __init__(self):
19 | self.session = requests.Session()
20 |
21 | @property
22 | def _headers(self):
23 | return {
24 | "Content-Type": "application/json",
25 | }
26 |
27 | def request(self, server, method, url, json=None, params=None, timeout=60):
28 | try:
29 | response = self.session.request(
30 | method, url, json=json, params=params,
31 | timeout=timeout, headers=self._headers
32 | )
33 |         except requests.exceptions.ConnectTimeout:
34 |             raise self.CODE_EXCEPTION_MSG[500](f"{server}: connection to the server timed out!")
35 |         except requests.exceptions.ConnectionError:
36 |             raise self.CODE_EXCEPTION_MSG[500](f"{server}: could not connect to the server!")
37 |
38 |         try:
39 |             response_data = response.json()
40 |         except Exception:
41 |             raise ServerError(f"{server}: failed to parse the server response!")
42 |
43 |         if not (200 <= response.status_code < 300):
44 |             exception = self.CODE_EXCEPTION_MSG[response.status_code] \
45 |                 if response.status_code in self.CODE_EXCEPTION_MSG else self.CODE_EXCEPTION_MSG[400]
46 |             raise exception(f"{server} Response: {response_data.get('error', {}).get('message')}")
47 |
48 | return response_data
49 |
50 |
51 | web_client = RequestMixin()
52 |
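web_client is a module-level singleton; non-2xx responses and connection problems surface as the custom exceptions mapped above. A hedged usage sketch (the spider address and endpoint are placeholders):

# Hypothetical call through the shared web_client; address and endpoint are placeholders.
from webs.api.exceptions.customs import InvalidAPIRequest, RecordNotFound, ServerError
from webs.core.requests import web_client

try:
    data = web_client.request(
        server='spider-node-1',
        method='GET',
        url='http://spider-node-1:5000/ping',
        timeout=10,
    )
    print(data)   # e.g. {"status": True, "data": "pong"}
except (InvalidAPIRequest, RecordNotFound, ServerError) as exc:
    print('call failed:', exc)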
--------------------------------------------------------------------------------
/services/engine/worker/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from celery import Celery
5 |
6 | ##################
7 | # Celery configuration
8 | from webs import create_app
9 |
10 |
11 | class CeleryConfig(object):
12 |     # Serialize tasks as JSON (the default serializer since Celery 4.0)
13 |     task_serializer = 'json'
14 |
15 |     # Serialize results as JSON
16 |     result_serializer = 'json'
17 |
18 |     # Result expiry time (one day)
19 |     result_expires = 60 * 60 * 24
20 |
21 |     # Accepted content types
22 |     accept_content = ["json"]
23 |
24 |     # Number of tasks prefetched per worker process
25 |     worker_prefetch_multiplier = 1
26 |
27 |     # Recycle each worker process after 200 tasks
28 |     worker_max_tasks_per_child = 200
29 |
30 |     # Timezone settings
31 | timezone = 'Asia/Shanghai'
32 | enable_utc = True
33 |
34 |
35 | ##################
36 | # Initialize the Celery worker
37 | def init_celery(app=None, celery_type='usual'):
38 | app = app or create_app()
39 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL'))
40 | celery_app.config_from_object(CeleryConfig)
41 |
42 |     # Import the relevant task modules
43 | if celery_type == 'usual':
44 | celery_app.conf.update(imports=['worker.engine', 'worker.result'])
45 | elif celery_type == 'beat':
46 | pass
47 | # celery_app.conf.update(
48 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron'])
49 | # celery_app.conf.update(
50 | # CELERYBEAT_SCHEDULE={
51 | # }
52 | # )
53 |
54 |     # Run tasks inside the Flask application context
55 | class ContextTask(celery_app.Task):
56 | """Make celery tasks work with Flask app context"""
57 |
58 | def __call__(self, *args, **kwargs):
59 | with app.app_context():
60 | return self.run(*args, **kwargs)
61 |
62 | celery_app.Task = ContextTask
63 | return celery_app
64 |
65 |
66 | celery_app = init_celery()
67 | # beat_app = init_celery(celery_type='beat')
68 |
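Tasks are registered on celery_app by name, so the engine can dispatch them without importing the task functions directly. A short sketch (the task id is illustrative; the names match the @celery_app.task(name=...) declarations in worker/engine.py below):

# Illustrative dispatch of the engine tasks defined in worker/engine.py.
from worker import celery_app

celery_app.send_task('delivery_task', args=[42])   # deliver task 42 to the spider nodes
celery_app.send_task('delete_task', args=[42])     # clean up task 42 asynchronously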
--------------------------------------------------------------------------------
/services/engine/worker/engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.exceptions.customs import RecordNotFound
4 | from webs.api.models.db_proxy import server_model_proxy, task_model_proxy, subtask_model_proxy, url_model_proxy, \
5 | schedule_task_proxy, task_url_model_proxy, result_model_proxy
6 | from worker import celery_app
7 | from worker.library.helper import split_urls, send, WeightedRandomGenerator, remove_files
8 |
9 |
10 | @celery_app.task(name='delivery_task')
11 | def delivery_task(task_id):
12 | """
13 |     Deliver (dispatch) a crawl task to the spider nodes.
14 | :param task_id:
15 | :return:
16 | """
17 |
18 | task_obj = task_model_proxy.find(id=task_id)
19 | if not task_obj:
20 | return
21 |
22 |     # Add a scheduling record for this crawl task
23 | schedule_task_obj = task_model_proxy.add_schedule_record(
24 | task_id=task_id, schedule_task_status="start_delivery", crawl_options=task_obj.crawl_options)
25 | schedule_task_id = schedule_task_obj.id
26 |
27 |     # Query the list of URL structs waiting to be crawled
28 | urls_struct = task_url_model_proxy.query_urls_by_task_id(task_id)
29 |
30 |     # Split the URLs into chunks
31 | chunk_urls_struct = split_urls(urls_struct)
32 |
33 |     # Fetch the spider node servers and their quotas (scores)
34 | servers_info = server_model_proxy.query_servers_by_score(sort='desc')
35 | if not servers_info:
36 | task_model_proxy.set_many_attr(obj_id=task_id, fields_v={'task_status': 'No server found!', 'finished': True})
37 | raise RecordNotFound('No server found!')
38 |
39 |     ###### Deliver subtasks
40 |     # If there are no more URL chunks than servers, assign them round-robin
41 | if len(chunk_urls_struct) <= len(servers_info):
42 | for index, chunk_url_struct in enumerate(chunk_urls_struct):
43 | send(schedule_task_id, chunk_url_struct, servers_info[index], task_obj.crawl_options)
44 |
45 |     # Otherwise pick servers by weighted random selection
46 | else:
47 | server_seeds = WeightedRandomGenerator(servers_info)
48 | for chunk_url_struct in chunk_urls_struct:
49 | send(schedule_task_id, chunk_url_struct, server_seeds.spawn(), task_obj.crawl_options)
50 |
51 |     ###### Set the parent task status based on how subtask delivery went
52 |     # Count the subtasks whose delivery failed
53 | failure_count = subtask_model_proxy.query_delivery_failure_count(schedule_task_id)
54 |     # If every subtask delivery failed, mark the current scheduled task as delivery_failure
55 | if failure_count == len(chunk_urls_struct):
56 | schedule_task_proxy.set_many_attr(
57 | obj_id=schedule_task_id, fields_v={'schedule_task_status': 'delivery_failure', 'finished': True}
58 | )
59 |         # For one-off (instantly) tasks, also mark the parent task as failed
60 | if task_obj.schedule_options.get('schedule_type') == 'instantly':
61 | task_model_proxy.set_many_attr(
62 | obj=task_obj, fields_v={'task_status': 'delivery_failure', 'finished': True}
63 | )
64 |
65 |     # If at least one delivery failed, mark it as a partial failure
66 | elif failure_count != 0:
67 | schedule_task_proxy.set_attr_by_id(
68 | obj_id=schedule_task_id, field='schedule_task_status', value='part_delivery_failure')
69 |
70 |     # Otherwise mark all deliveries as successful
71 | else:
72 | schedule_task_proxy.set_attr_by_id(
73 | obj_id=schedule_task_id, field='schedule_task_status', value='delivery_success')
74 |
75 |
76 | @celery_app.task(name='delete_task')
77 | def delete_task(task_id):
78 | """
79 |     Delete a task. Screenshots and HAR files must be removed as well, so deletion runs asynchronously.
80 | :param task_id:
81 | :return:
82 | """
83 |
84 |     # Query all schedule tasks of this task
85 | schedule_task_subquery = schedule_task_proxy.db_session.query(schedule_task_proxy.model.id) \
86 | .filter(schedule_task_proxy.model.task_id == task_id).subquery()
87 |
88 |     # Query all subtasks
89 | subtask_subquery = subtask_model_proxy.db_session.query(subtask_model_proxy.model.id).filter(
90 | subtask_model_proxy.model.schedule_task_id.in_(schedule_task_subquery)).subquery()
91 |
92 |     ###### Delete results
93 | result_query = result_model_proxy.self_session.filter(
94 | result_model_proxy.model.subtask_id.in_(subtask_subquery))
95 |
96 |     # Delete screenshots
97 | screenshot_ids = [each.screenshot_id + '.png' for each in
98 | result_query.filter(result_model_proxy.model.screenshot_id.isnot(None)).all()]
99 | remove_files(path='screenshots', file_ids=screenshot_ids)
100 |
101 |     # Delete HAR files
102 | har_ids = [each.har_uuid + '.json' for each in
103 | result_query.filter(result_model_proxy.model.har_uuid.isnot(None)).all()]
104 | remove_files(path='hars', file_ids=har_ids)
105 |
106 |     # Delete result rows
107 | result_query.delete(synchronize_session=False)
108 | result_model_proxy.safe_commit()
109 |
110 |     # Delete schedule task rows
111 | schedule_task_proxy.delete_models(ids=schedule_task_subquery, fields='id')
112 |
113 |     # Delete subtask rows
114 | subtask_model_proxy.delete_models(ids=subtask_subquery, fields='id')
115 |
116 |     # Delete task_url relations
117 | task_url_model_proxy.delete_model(task_id=task_id)
118 |
--------------------------------------------------------------------------------
/services/engine/worker/library/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/worker/library/favicon.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import requests
5 |
6 | from urllib.parse import urljoin, urlparse
7 | from bs4 import BeautifulSoup
8 |
9 | favicon_link_rules = [
10 | 'icon',
11 | 'shortcut icon',
12 | 'apple-touch-icon',
13 | 'apple-touch-icon-precomposed',
14 | ]
15 |
16 | meta_names = ['msapplication-TileImage', 'og:image']
17 |
18 | headers = {
19 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
20 | }
21 |
22 |
23 | def get_favicon_link(url, html):
24 | """
25 |     Get the website favicon link.
26 | :param url:
27 | :param html:
28 | :return:
29 | """
30 |
31 |     # Parse the page source to see whether it declares a favicon link
32 | soup = BeautifulSoup(html, features='html.parser')
33 |
34 |     # Look for <link> tags
35 | for rule in favicon_link_rules:
36 | favicon_tag = soup.find('link', attrs={'rel': lambda r: r and r.lower() == rule, 'href': True})
37 | if favicon_tag:
38 | favicon_href = favicon_tag.get('href', '').strip()
39 | return fmt_link(url, favicon_href)
40 |
41 |     # Look for <meta> tags
42 | for meta_tag in soup.find_all('meta', attrs={'content': True}):
43 |         meta_type = (meta_tag.get('name') or meta_tag.get('property') or '').lower()
44 | for name in meta_names:
45 | if meta_type == name.lower():
46 |                 favicon_href = meta_tag.get('content', '').strip()  # these meta tags carry the URL in their content attribute
47 | return fmt_link(url, favicon_href)
48 |
49 |     # Fall back to requesting /favicon.ico at the site root
50 | root_icon_link = get_root_dir_icon(url)
51 | if root_icon_link:
52 | return root_icon_link, 'ico'
53 |
54 | return None, None
55 |
56 |
57 | def fmt_link(website_url, href):
58 | """
59 |     Normalize the favicon href into an absolute URL and validate it.
60 | :param website_url:
61 | :param href:
62 | :return: favicon_link, ext
63 | """
64 |
65 | if not href or href.startswith('data:image/'):
66 | return None, None
67 |
68 | if not urlparse(href).netloc:
69 | href = urljoin(website_url, href)
70 |
71 | if urlparse(href).netloc:
72 | url_parsed = href
73 | else:
74 | url_parsed = urljoin(website_url, href)
75 |
76 | url_parsed = urlparse(url_parsed, scheme=urlparse(website_url).scheme)
77 | _, ext = os.path.splitext(url_parsed.path)
78 | favicon_url = url_parsed.geturl()
79 | try:
80 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers)
81 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'):
82 | return favicon_url, ext[1:].lower()
83 | except Exception as e:
84 | return None, None
85 | return None, None
86 |
87 |
88 | def get_root_dir_icon(url):
89 | try:
90 | parsed = urlparse(url)
91 | favicon_url = parsed.scheme + "://" + parsed.netloc + '/favicon.ico'
92 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers)
93 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'):
94 | return response.url
95 | except Exception as e:
96 | return
97 |
98 | return
99 |
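A small usage sketch of get_favicon_link; the HTML below is illustrative. Note that fmt_link validates the candidate link with a real HTTP request, so offline (or for an unreachable URL) the call falls back to the /favicon.ico probe and may return (None, None):

# Illustrative call; network access determines the actual return value.
from worker.library.favicon import get_favicon_link

html = (
    '<html>'
    '<head><link rel="icon" href="https://example.com/static/favicon.png"></head>'
    '<body></body>'
    '</html>'
)
link, ext = get_favicon_link('https://example.com', html)
print(link, ext)   # the declared icon URL and extension, or (None, None) if it cannot be fetched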
--------------------------------------------------------------------------------
/services/engine/worker/library/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import bisect
4 | import hashlib
5 | import math
6 | import os
7 | import random
8 | import uuid
9 | from datetime import datetime
10 |
11 | import requests
12 | from html2text import HTML2Text
13 |
14 | from webs.api.models.db_proxy import subtask_model_proxy
15 | from webs.core.requests import web_client
16 | from worker.library.favicon import get_favicon_link
17 |
18 |
19 | class WeightedRandomGenerator(object):
20 | def __init__(self, weights):
21 | print(weights)
22 | self.weights = weights
23 | self.totals = []
24 | running_total = 0
25 |
26 | for w in weights:
27 | running_total += w['score']
28 | self.totals.append(running_total)
29 |
30 | def spawn(self):
31 | rnd = random.random() * self.totals[-1]
32 | index = bisect.bisect_right(self.totals, rnd)
33 | return self.weights[index]
34 |
35 | def __call__(self):
36 | return self.spawn()
37 |
38 |
39 | def split_urls(urls):
40 | """对url列表进行拆分"""
41 | if len(urls) > 100:
42 | m = len(urls) // 100
43 | n = int(math.ceil(len(urls) / float(m)))
44 | chunk_list = [urls[i:i + n] for i in range(0, len(urls), n)]
45 | else:
46 | chunk_list = [urls]
47 |
48 | return chunk_list
49 |
50 |
51 | def send(schedule_task_id, url_nested_list, server_info, options):
52 |     # Create the subtask model
53 | subtask_obj = subtask_model_proxy.create(schedule_task_id, server_id=server_info['server_id'])
54 |
55 |     # Send the request to the spider node
56 | try:
57 | response = web_client.request(
58 | server=server_info['server_name'],
59 | url=server_info['server_address'] + '/crawl_tasks',
60 | method='POST', timeout=60,
61 | json={
62 | 'subtask_id': subtask_obj.id,
63 | 'url_nested_list': url_nested_list,
64 | 'options': options
65 | }
66 | )
67 | failure_msg = '' if response['status'] is True else response['error']['message']
68 | except Exception as e:
69 |         failure_msg = getattr(e, 'message', str(e))
70 | if failure_msg:
71 |         # Record the subtask delivery failure reason
72 | subtask_model_proxy.set_many_attr(obj=subtask_obj, fields_v={
73 | 'finished': True,
74 | 'finished_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
75 | 'delivery_failure_msg': failure_msg
76 | })
77 |
78 |
79 | def extract_text(content):
80 | """
81 |     Extract the readable body text from an HTML page.
82 | :param content:
83 | :return:
84 | """
85 |
86 | h = HTML2Text(bodywidth=0)
87 | h.ignore_links = True
88 | h.ignore_images = True
89 | h.ignore_tables = True
90 | h.ignore_emphasis = True
91 | try:
92 | result = h.handle(content).replace('*', '').replace('\n\n', '\n')
93 | except Exception as e:
94 | result = None
95 | return '' if result == '\n' else result
96 |
97 |
98 | def save_favicon(url, html):
99 | """
100 |     Download and save the website favicon; returns (favicon_md5, favicon_link).
101 | :param url:
102 | :param html:
103 | :return:
104 | """
105 | favicon_link, icon_ext = get_favicon_link(url, html)
106 | if favicon_link:
107 | try:
108 | response = requests.get(favicon_link, stream=True, timeout=10)
109 | except Exception as e:
110 | return None, None
111 | temp_filename = str(uuid.uuid4())
112 | save_path = '/usr/src/app/screenshots/{}.{}'.format(temp_filename, icon_ext)
113 | with open(save_path, 'wb+') as image:
114 | for chunk in response.iter_content(1024):
115 | image.write(chunk)
116 | image.seek(0)
117 | favicon_md5 = hashlib.md5(image.read()).hexdigest()
118 | os.rename(save_path, '/usr/src/app/screenshots/{}.{}'.format(favicon_md5, icon_ext))
119 | return favicon_md5, favicon_link
120 | return None, None
121 |
122 |
123 | def remove_files(path, file_ids):
124 | """
125 |     Remove files under the given path by their file ids.
126 | :return:
127 | """
128 |
129 | for file_id in file_ids:
130 | try:
131 | os.remove(f'/usr/src/app/{path}/{file_id}')
132 | except FileNotFoundError as e:
133 | pass
134 |
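A worked sketch of the two dispatch helpers used by delivery_task below: split_urls chunks the URL list, and WeightedRandomGenerator picks servers in proportion to their score. The URL structs and server dicts here are made up; the real ones come from the db proxies:

# Illustrative only; server dicts mimic the keys used by send() and WeightedRandomGenerator.
from worker.library.helper import WeightedRandomGenerator, split_urls

urls = [{'url_id': i, 'url_address': f'https://example.com/{i}'} for i in range(250)]
chunks = split_urls(urls)
print([len(c) for c in chunks])   # 250 URLs -> two chunks of 125

servers = [
    {'server_id': 1, 'server_name': 'node-1', 'server_address': 'http://node-1:5000', 'score': 3},
    {'server_id': 2, 'server_name': 'node-2', 'server_address': 'http://node-2:5000', 'score': 1},
]
picker = WeightedRandomGenerator(servers)
picked = [picker.spawn()['server_name'] for _ in range(4)]
print(picked)   # node-1 is chosen roughly three times as often as node-2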
--------------------------------------------------------------------------------
/services/engine/worker/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import re
5 |
6 | from rpc.client.callback_client import CallbackClient
7 | from wappalyzer import wappalyzer_handler
8 | from webs.api.models.db_proxy import result_model_proxy, task_model_proxy
9 | from webs.core.requests import web_client
10 | from worker import celery_app
11 | from worker.library.helper import extract_text, save_favicon
12 |
13 |
14 | def callback_http(callback_address, task_obj, result, finished):
15 | """
16 |     Callback via HTTP.
17 | :return:
18 | """
19 | try:
20 | response = web_client.request(
21 | server='callback', method='POST',
22 | url=callback_address,
23 | timeout=60, json={
24 | 'customer_id': task_obj.customer_id,
25 | 'extra_data': task_obj.extra_data,
26 | 'task_id': task_obj.id,
27 | 'finished': finished,
28 | 'result': result
29 | }
30 | )
31 | failure_msg = '' if response['status'] is True else response['error']['message']
32 | except Exception as e:
33 |         failure_msg = getattr(e, 'message', str(e))
34 | if failure_msg and result.get('result_id'):
35 | result_model_proxy.set_attr_by_id(result['result_id'], 'callback_failure_msg', failure_msg)
36 |
37 |
38 | def callback_grpc(callback_address, task_obj, result):
39 | """
40 | 回调方式为rpc
41 | :return:
42 | """
43 |
44 | callback_client = CallbackClient(rpc_server=callback_address)
45 | callback_client.callback_save_result(task_obj, result)
46 |
47 |
48 | @celery_app.task(name='save_base_result')
49 | def save_base_result_by_grpc(**kwargs):
50 | """
51 |     Asynchronously collect and store the crawl result data.
52 | :param kwargs:
53 | :return:
54 | """
55 |
56 | task_obj = task_model_proxy.query_task_obj_by_subtask(subtask_id=kwargs['subtask_id'])
57 | if not task_obj:
58 | return
59 |
60 |     # Parse the website character encoding
61 | try:
62 |         m = re.compile('
--------------------------------------------------------------------------------
/services/spider/Dockerfile-dev:
--------------------------------------------------------------------------------
15 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
16 |
17 | # set working directory
18 | RUN mkdir -p /usr/src/app
19 | WORKDIR /usr/src/app
20 |
21 | # add and install requirements
22 | COPY ./requirements.txt /usr/src/app/requirements.txt
23 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \
24 | pip install -r requirements.txt -i https://pypi.douban.com/simple
25 |
26 | # add app
27 | COPY . /usr/src/app
28 |
29 | # run server
30 | CMD ["/usr/src/app/entrypoint.sh"]
31 |
--------------------------------------------------------------------------------
/services/spider/Dockerfile-prod:
--------------------------------------------------------------------------------
1 | FROM harbor.socmap.net/crawloop/playwright-xvfb:v1.0.0
2 |
3 | WORKDIR /usr/src/app
4 |
5 | COPY ./requirements.txt /usr/src/app
6 |
7 | RUN apt update && \
8 | apt-get -y install netcat && \
9 | rm -rf /var/lib/apt/lists/*
10 |
11 |
12 | # set timezone
13 | ENV TZ=Asia/Shanghai
14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
15 |
16 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \
17 | pip install -r requirements.txt -i https://pypi.douban.com/simple
18 |
19 | COPY . /usr/src/app
20 |
21 | RUN sh build.sh
22 |
23 | CMD ["/usr/src/app/entrypoint.sh"]
24 |
25 |
26 |
--------------------------------------------------------------------------------
/services/spider/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean __pycache__ directories
4 | find . -type d -name __pycache__ | xargs rm -rf
5 |
6 | # Compile the code with Cython
7 | python3 compile.py build_ext --inplace
8 | if [ $? -ne 0 ]; then
9 | exit 1
10 | fi
11 |
12 | # Rename the generated .so files (strip the cpython suffix)
13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
16 |
17 | # Remove the .py source files
18 | find ./rpc -name '*.py' | xargs rm -f
19 | find ./webs -name '*.py' | xargs rm -f
20 | find ./worker -name '*.py' | xargs rm -f
21 |
22 | # Remove files that are no longer needed
23 | rm -rf build
24 | rm -f .gitignore
25 | rm -f compile.py
26 | rm -f build.sh
27 |
--------------------------------------------------------------------------------
/services/spider/build_sentry_ini.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import os
5 |
6 | from configobj import ConfigObj
7 |
8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8')
9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False)
10 | log_ini.write()
11 |
--------------------------------------------------------------------------------
/services/spider/compile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Cython.Build import cythonize
4 | from Cython.Distutils import build_ext
5 | from setuptools import setup
6 | from setuptools.extension import Extension
7 |
8 | setup(
9 | ext_modules=cythonize(
10 | [
11 | Extension('rpc.*', ['rpc/*.py']),
12 | Extension('rpc.pb.*', ['rpc/pb/*.py']),
13 | Extension('rpc.client.*', ['rpc/client/*.py']),
14 | Extension('webs.*', ['webs/*.py']),
15 | Extension('webs.api.*', ['webs/api/*.py']),
16 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']),
17 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']),
18 | Extension('webs.api.models*', ['webs/api/models/*.py']),
19 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']),
20 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']),
21 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']),
22 | Extension('webs.api.views.*', ['webs/api/views/*.py']),
23 | Extension('worker.*', ['worker/*.py']),
24 | Extension('worker.library.*', ['worker/library/*.py']),
25 | ],
26 | build_dir='build',
27 | compiler_directives=dict(
28 | always_allow_keywords=True, language_level=3
29 | )
30 | ),
31 | cmdclass=dict(
32 | build_ext=build_ext
33 | )
34 | )
35 |
--------------------------------------------------------------------------------
/services/spider/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Web service
4 | if [ "$ENDPOINT" = "web" ]; then
5 |     # Development environment
6 |     if [ "$FLASK_ENV" = "development" ]; then
7 |         flask run -h 0.0.0.0 -p 5000
8 |
9 |     # Production environment
10 | elif [ "$FLASK_ENV" = "production" ]; then
11 | python build_sentry_ini.py
12 | gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app
13 | fi
14 |
15 | # Fetch (crawl) worker
16 | elif [ "$ENDPOINT" = "fetch" ]; then
17 |     # Start a virtual display
18 |     echo "starting Xvfb"
19 | rm -rf /tmp/.X99-lock
20 | Xvfb -screen 0 1020x720x16 :99 &
21 | export DISPLAY=:99
22 | celery -A worker.celery_app worker -Q priority_fetch -l info -c $WORK_MAX_COUNT --prefetch-multiplier 1 --max-tasks-per-child 1 -n crawl_fetch@%h
23 |
24 | # Result-saving worker
25 | elif [ "$ENDPOINT" = "results" ]; then
26 | # celery -A worker.celery_app worker -Q results -l info -c 5 --prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h
27 | celery -A worker.celery_app worker -Q results -l info --pool=prefork --concurrency=5 --without-heartbeat --prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h
28 | fi
29 |
--------------------------------------------------------------------------------
/services/spider/gunicorn_config.py:
--------------------------------------------------------------------------------
1 | # Sample Gunicorn configuration file.
2 |
3 | import multiprocessing as mlp
4 |
5 | # Apply gevent monkey-patching up front to avoid recursion issues
6 | import gevent.monkey
7 |
8 | gevent.monkey.patch_all()
9 |
10 | #
11 | # Server socket
12 | #
13 | # bind - The socket to bind.
14 | #
15 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'.
16 | # An IP is a valid HOST.
17 | #
18 | # backlog - The number of pending connections. This refers
19 | # to the number of clients that can be waiting to be
20 | # served. Exceeding this number results in the client
21 | # getting an error when attempting to connect. It should
22 | # only affect servers under significant load.
23 | #
24 | # Must be a positive integer. Generally set in the 64-2048
25 | # range.
26 | #
27 |
28 | bind = '0.0.0.0:5000'
29 | backlog = 2048
30 |
31 | #
32 | # Worker processes
33 | #
34 | # workers - The number of worker processes that this server
35 | # should keep alive for handling requests.
36 | #
37 | # A positive integer generally in the 2-4 x $(NUM_CORES)
38 | # range. You'll want to vary this a bit to find the best
39 | # for your particular application's work load.
40 | #
41 | # worker_class - The type of workers to use. The default
42 | # sync class should handle most 'normal' types of work
43 | # loads. You'll want to read
44 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type
45 | # for information on when you might want to choose one
46 | # of the other worker classes.
47 | #
48 | # A string referring to a Python path to a subclass of
49 | # gunicorn.workers.base.Worker. The default provided values
50 | # can be seen at
51 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class
52 | #
53 | # worker_connections - For the eventlet and gevent worker classes
54 | # this limits the maximum number of simultaneous clients that
55 | # a single process can handle.
56 | #
57 | # A positive integer generally set to around 1000.
58 | #
59 | # timeout - If a worker does not notify the master process in this
60 | # number of seconds it is killed and a new worker is spawned
61 | # to replace it.
62 | #
63 | # Generally set to thirty seconds. Only set this noticeably
64 | # higher if you're sure of the repercussions for sync workers.
65 | # For the non sync workers it just means that the worker
66 | # process is still communicating and is not tied to the length
67 | # of time required to handle a single request.
68 | #
69 | # keepalive - The number of seconds to wait for the next request
70 | # on a Keep-Alive HTTP connection.
71 | #
72 | # A positive integer. Generally set in the 1-5 seconds range.
73 | #
74 |
75 | # Number of processes
76 | workers = mlp.cpu_count() * 2 + 1
77 |
78 | # Threads
79 | threads = mlp.cpu_count() * 2
80 |
81 | worker_class = 'gevent'
82 | worker_connections = 1000
83 | timeout = 30
84 | keepalive = 2
85 |
86 | #
87 | # spew - Install a trace function that spews every line of Python
88 | # that is executed when running the server. This is the
89 | # nuclear option.
90 | #
91 | # True or False
92 | #
93 |
94 | spew = False
95 |
96 | #
97 | # Server mechanics
98 | #
99 | # daemon - Detach the main Gunicorn process from the controlling
100 | # terminal with a standard fork/fork sequence.
101 | #
102 | # True or False
103 | #
104 | # raw_env - Pass environment variables to the execution environment.
105 | #
106 | # pidfile - The path to a pid file to write
107 | #
108 | # A path string or None to not write a pid file.
109 | #
110 | # user - Switch worker processes to run as this user.
111 | #
112 | # A valid user id (as an integer) or the name of a user that
113 | # can be retrieved with a call to pwd.getpwnam(value) or None
114 | # to not change the worker process user.
115 | #
116 | # group - Switch worker process to run as this group.
117 | #
118 | # A valid group id (as an integer) or the name of a user that
119 | # can be retrieved with a call to pwd.getgrnam(value) or None
120 | # to change the worker processes group.
121 | #
122 | # umask - A mask for file permissions written by Gunicorn. Note that
123 | # this affects unix socket permissions.
124 | #
125 | # A valid value for the os.umask(mode) call or a string
126 | # compatible with int(value, 0) (0 means Python guesses
127 | # the base, so values like "0", "0xFF", "0022" are valid
128 | # for decimal, hex, and octal representations)
129 | #
130 | # tmp_upload_dir - A directory to store temporary request data when
131 | # requests are read. This will most likely be disappearing soon.
132 | #
133 | # A path to a directory where the process owner can write. Or
134 | # None to signal that Python should choose one on its own.
135 | #
136 |
137 | #
138 | # Logging
139 | #
140 | # logfile - The path to a log file to write to.
141 | #
142 | # A path string. "-" means log to stdout.
143 | #
144 | # loglevel - The granularity of log output
145 | #
146 | # A string of "debug", "info", "warning", "error", "critical"
147 | #
148 |
149 | errorlog = '-'
150 | loglevel = 'error'
151 | accesslog = '-'
152 | access_log_format = '{"request_address": "%(h)s", ' \
153 | '"request_time": "%(t)s", ' \
154 | '"request": "%(r)s", ' \
155 | '"http_status_code": "%(s)s", ' \
156 | '"http_request_url": "%(U)s", ' \
157 | '"http_query_string": "%(q)s", ' \
158 | '"request_headers": {' \
159 | '"content-type": "%({content-type}i)s", ' \
160 | '"content-length": "%({content-length}i)s", ' \
161 | '"user-agent": "%(a)s"' \
162 | '}}'
163 |
--------------------------------------------------------------------------------
/services/spider/gunicorn_logging.ini:
--------------------------------------------------------------------------------
1 | # Logging configuration
2 |
3 | [loggers]
4 | keys = root, gunicorn.access, gunicorn.error
5 |
6 | [handlers]
7 | keys = access, error, sentry
8 |
9 | [formatters]
10 | keys = json, generic
11 |
12 | # Root logger
13 | # The root logger sends messages to the console and to Sentry.
14 | [logger_root]
15 | handlers = error, sentry
16 |
17 | # Gunicorn loggers
18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'.
19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation.
20 | # Only the error logger additionally sends messages to Sentry.
21 |
22 | [logger_gunicorn.access]
23 | level = INFO
24 | handlers = access
25 | propagate = 0
26 | qualname = gunicorn.access
27 |
28 | [logger_gunicorn.error]
29 | level = ERROR
30 | handlers = error, sentry
31 | propagate = 0
32 | qualname = gunicorn.error
33 |
34 | # Handlers
35 | [handler_access]
36 | class = StreamHandler
37 | formatter = json
38 | args = (sys.stdout, )
39 |
40 | [handler_error]
41 | class = StreamHandler
42 | formatter = json
43 | args = (sys.stderr,)
44 |
45 | [handler_sentry]
46 | class = raven.handlers.logging.SentryHandler
47 | level = ERROR
48 | formatter = generic
49 | sentry_dsn = example
50 | args = [%(sentry_dsn)s]
51 |
52 | [formatter_generic]
53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s
54 | [formatter_json]
55 | class = webs.api.utils.loggers.JSONFormatter
--------------------------------------------------------------------------------
/services/spider/manage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask.cli import FlaskGroup
4 | from webs import create_app
5 |
6 | app = create_app()
7 | cli = FlaskGroup(create_app=create_app)
8 |
9 | if __name__ == '__main__':
10 | cli()
11 |
--------------------------------------------------------------------------------
/services/spider/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/services/spider/migrations/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # template used to generate migration files
5 | # file_template = %%(rev)s_%%(slug)s
6 |
7 | # set to 'true' to run the environment during
8 | # the 'revision' command, regardless of autogenerate
9 | # revision_environment = false
10 |
11 |
12 | # Logging configuration
13 | [loggers]
14 | keys = root,sqlalchemy,alembic
15 |
16 | [handlers]
17 | keys = console
18 |
19 | [formatters]
20 | keys = generic
21 |
22 | [logger_root]
23 | level = WARN
24 | handlers = console
25 | qualname =
26 |
27 | [logger_sqlalchemy]
28 | level = WARN
29 | handlers =
30 | qualname = sqlalchemy.engine
31 |
32 | [logger_alembic]
33 | level = INFO
34 | handlers =
35 | qualname = alembic
36 |
37 | [handler_console]
38 | class = StreamHandler
39 | args = (sys.stderr,)
40 | level = NOTSET
41 | formatter = generic
42 |
43 | [formatter_generic]
44 | format = %(levelname)-5.5s [%(name)s] %(message)s
45 | datefmt = %H:%M:%S
46 |
--------------------------------------------------------------------------------
/services/spider/migrations/env.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 |
3 | import logging
4 | from logging.config import fileConfig
5 |
6 | from sqlalchemy import engine_from_config
7 | from sqlalchemy import pool
8 |
9 | from alembic import context
10 |
11 | # this is the Alembic Config object, which provides
12 | # access to the values within the .ini file in use.
13 | config = context.config
14 |
15 | # Interpret the config file for Python logging.
16 | # This line sets up loggers basically.
17 | fileConfig(config.config_file_name)
18 | logger = logging.getLogger('alembic.env')
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | from flask import current_app
25 | config.set_main_option('sqlalchemy.url',
26 | current_app.config.get('SQLALCHEMY_DATABASE_URI'))
27 | target_metadata = current_app.extensions['migrate'].db.metadata
28 |
29 | # other values from the config, defined by the needs of env.py,
30 | # can be acquired:
31 | # my_important_option = config.get_main_option("my_important_option")
32 | # ... etc.
33 |
34 |
35 | def run_migrations_offline():
36 | """Run migrations in 'offline' mode.
37 |
38 | This configures the context with just a URL
39 | and not an Engine, though an Engine is acceptable
40 | here as well. By skipping the Engine creation
41 | we don't even need a DBAPI to be available.
42 |
43 | Calls to context.execute() here emit the given string to the
44 | script output.
45 |
46 | """
47 | url = config.get_main_option("sqlalchemy.url")
48 | context.configure(
49 | url=url, target_metadata=target_metadata, literal_binds=True
50 | )
51 |
52 | with context.begin_transaction():
53 | context.run_migrations()
54 |
55 |
56 | def run_migrations_online():
57 | """Run migrations in 'online' mode.
58 |
59 | In this scenario we need to create an Engine
60 | and associate a connection with the context.
61 |
62 | """
63 |
64 | # this callback is used to prevent an auto-migration from being generated
65 | # when there are no changes to the schema
66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
67 | def process_revision_directives(context, revision, directives):
68 | if getattr(config.cmd_opts, 'autogenerate', False):
69 | script = directives[0]
70 | if script.upgrade_ops.is_empty():
71 | directives[:] = []
72 | logger.info('No changes in schema detected.')
73 |
74 | connectable = engine_from_config(
75 | config.get_section(config.config_ini_section),
76 | prefix='sqlalchemy.',
77 | poolclass=pool.NullPool,
78 | )
79 |
80 | with connectable.connect() as connection:
81 | context.configure(
82 | connection=connection,
83 | target_metadata=target_metadata,
84 | process_revision_directives=process_revision_directives,
85 | **current_app.extensions['migrate'].configure_args
86 | )
87 |
88 | with context.begin_transaction():
89 | context.run_migrations()
90 |
91 |
92 | if context.is_offline_mode():
93 | run_migrations_offline()
94 | else:
95 | run_migrations_online()
96 |
--------------------------------------------------------------------------------
/services/spider/migrations/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 |
18 |
19 | def upgrade():
20 | ${upgrades if upgrades else "pass"}
21 |
22 |
23 | def downgrade():
24 | ${downgrades if downgrades else "pass"}
25 |
--------------------------------------------------------------------------------
/services/spider/migrations/versions/81a88acb3641_记录cookies.py:
--------------------------------------------------------------------------------
1 | """记录cookies
2 |
3 | Revision ID: 81a88acb3641
4 | Revises: 8efa2b9dcc87
5 | Create Date: 2020-12-22 15:37:26.700404
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '81a88acb3641'
14 | down_revision = '8efa2b9dcc87'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('cookies', postgresql.JSONB(astext_type=sa.Text()), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'cookies')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/spider/migrations/versions/8efa2b9dcc87_init.py:
--------------------------------------------------------------------------------
1 | """init
2 |
3 | Revision ID: 8efa2b9dcc87
4 | Revises:
5 | Create Date: 2020-12-08 10:22:43.545415
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '8efa2b9dcc87'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.create_table('crawl_tasks',
22 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False),
23 | sa.Column('subtask_id', sa.Integer(), nullable=False),
24 | sa.Column('url_nested_list', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
25 | sa.Column('process_state', sa.String(length=30), server_default='readying', nullable=True),
26 | sa.Column('failure_url_ids', sa.ARRAY(sa.Integer()), server_default='{}', nullable=True),
27 | sa.Column('finished_at', sa.TIMESTAMP(), nullable=True),
28 | sa.Column('options', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
29 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
30 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
31 | sa.PrimaryKeyConstraint('id')
32 | )
33 | op.create_index(op.f('ix_crawl_tasks_create_time'), 'crawl_tasks', ['create_time'], unique=False)
34 | op.create_index(op.f('ix_crawl_tasks_subtask_id'), 'crawl_tasks', ['subtask_id'], unique=False)
35 | op.create_index(op.f('ix_crawl_tasks_update_time'), 'crawl_tasks', ['update_time'], unique=False)
36 | op.create_table('results',
37 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False),
38 | sa.Column('subtask_id', sa.Integer(), nullable=False),
39 | sa.Column('url_id', sa.Integer(), nullable=False),
40 | sa.Column('url_address', sa.String(length=1024), nullable=False),
41 | sa.Column('http_code', sa.Integer(), nullable=True),
42 | sa.Column('title', sa.Text(), nullable=True),
43 | sa.Column('content', sa.Text(), nullable=True),
44 | sa.Column('current_url', sa.String(length=1024), nullable=True),
45 | sa.Column('redirect_chain', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
46 | sa.Column('response_headers', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
47 | sa.Column('har_uuid', sa.String(length=128), nullable=True),
48 | sa.Column('screenshot_id', sa.String(length=128), nullable=True),
49 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
50 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
51 | sa.PrimaryKeyConstraint('id')
52 | )
53 | op.create_index(op.f('ix_results_create_time'), 'results', ['create_time'], unique=False)
54 | op.create_index(op.f('ix_results_subtask_id'), 'results', ['subtask_id'], unique=False)
55 | op.create_index(op.f('ix_results_update_time'), 'results', ['update_time'], unique=False)
56 | op.create_index(op.f('ix_results_url_id'), 'results', ['url_id'], unique=False)
57 | # ### end Alembic commands ###
58 |
59 |
60 | def downgrade():
61 | # ### commands auto generated by Alembic - please adjust! ###
62 | op.drop_index(op.f('ix_results_url_id'), table_name='results')
63 | op.drop_index(op.f('ix_results_update_time'), table_name='results')
64 | op.drop_index(op.f('ix_results_subtask_id'), table_name='results')
65 | op.drop_index(op.f('ix_results_create_time'), table_name='results')
66 | op.drop_table('results')
67 | op.drop_index(op.f('ix_crawl_tasks_update_time'), table_name='crawl_tasks')
68 | op.drop_index(op.f('ix_crawl_tasks_subtask_id'), table_name='crawl_tasks')
69 | op.drop_index(op.f('ix_crawl_tasks_create_time'), table_name='crawl_tasks')
70 | op.drop_table('crawl_tasks')
71 | # ### end Alembic commands ###
72 |
--------------------------------------------------------------------------------
/services/spider/requirements.txt:
--------------------------------------------------------------------------------
1 | alembic==1.4.3
2 | amqp==2.6.1
3 | appdirs==1.4.4
4 | billiard==3.6.3.0
5 | celery==4.3.0
6 | certifi==2020.11.8
7 | chardet==3.0.4
8 | click==7.1.2
9 | configobj==5.0.6
10 | Cython==0.29.21
11 | Flask==1.1.2
12 | Flask-Migrate==2.4.0
13 | Flask-Redis==0.3.0
14 | Flask-SQLAlchemy==2.3.2
15 | gevent==1.4.0
16 | greenlet==0.4.15
17 | grpcio==1.33.2
18 | grpcio-tools==1.33.2
19 | gunicorn==19.9.0
20 | idna==2.8
21 | importlib-metadata==2.0.0
22 | itsdangerous==1.1.0
23 | Jinja2==2.11.2
24 | kombu==4.6.11
25 | Mako==1.1.3
26 | MarkupSafe==1.1.1
27 | marshmallow==2.19.2
28 | protobuf==3.14.0
29 | psutil==5.7.3
30 | psycopg2-binary==2.7.6.1
31 | pyee==7.0.4
32 | pyppeteer==0.2.2
33 | python-dateutil==2.8.1
34 | python-editor==1.0.4
35 | pytz==2020.4
36 | raven==6.10.0
37 | redis==3.5.3
38 | requests==2.22.0
39 | six==1.15.0
40 | SQLAlchemy==1.3.20
41 | tqdm==4.52.0
42 | urllib3==1.25.11
43 | vine==1.3.0
44 | webargs==4.0.0
45 | websockets==8.1
46 | Werkzeug==1.0.1
47 | zipp==3.4.0
48 | zope.event==4.5.0
49 | zope.interface==5.2.0
50 |
--------------------------------------------------------------------------------
/services/spider/rpc/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/rpc/client/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/rpc/client/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import os
4 |
5 | import grpc
6 |
7 | from rpc.pb import result_pb2
8 | from rpc.pb.result_pb2_grpc import ResultStub
9 |
10 | CHUNK_SIZE = 10 * 1024
11 |
12 |
13 | def get_file_chunks(filename, folder_path):
14 | yield result_pb2.StreamUploadPictureRequest(filename=filename)
15 | with open(f'/usr/src/app/{folder_path}/' + filename, 'rb') as f:
16 | while True:
17 | piece = f.read(CHUNK_SIZE)
18 | if len(piece) == 0:
19 | return
20 | yield result_pb2.StreamUploadPictureRequest(file_data={"buffer": piece})
21 |
22 |
23 | def remove_file(file_path):
24 | """
25 | 删除文件
26 | :param file_path:
27 | :return:
28 | """
29 |
30 | try:
31 | os.remove(file_path)
32 | except (NotImplementedError, FileNotFoundError):
33 | pass
34 |
35 |
36 | class ResultClient(object):
37 |
38 | def __init__(self, rpc_server):
39 | # Channel to the RPC server
40 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[
41 | ('grpc.max_send_message_length', int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024),
42 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024),
43 | ])
44 |
45 | # Get the Result gRPC service stub
46 | self.stub = ResultStub(channel)
47 |
48 | def save_base_result(self, subtask_id, url_id, url_address, finished_at, **kwargs):
49 | """保存爬虫基本信息"""
50 |
51 | # 返回头部序列化
52 | kwargs['response_headers'] = self.dic2json(kwargs.pop('response_headers', {}))
53 |
54 | # 生成状态码
55 | kwargs['http_code'] = kwargs['redirect_chain'][-1]['redirect_http_code'] if kwargs['redirect_chain'] else None
56 |
57 | # 去除firefox和chrome默认content
58 | if kwargs['content'] and (kwargs['content'].startswith(
59 | '')
60 | or kwargs['content'] == ''):
61 | kwargs['content'] = None
62 |
63 | # # http交互过程序列化
64 | # kwargs['http_archive'] = self.dic2json(kwargs.pop('http_archive', []))
65 | self.stub.SaveBaseResult(
66 | result_pb2.SaveBaseResultRequest(
67 | subtask_id=subtask_id, url_id=url_id, url_address=url_address,
68 | finished_at=finished_at, **kwargs),
69 | timeout=30
70 | )
71 |
72 | def upload_screenshot(self, screenshot_name):
73 | """上传截图"""
74 | chunks_generator = get_file_chunks(screenshot_name, folder_path='screenshots')
75 | response = self.stub.StreamUploadPicture(chunks_generator)
76 | file_path = f'/usr/src/app/screenshots/{screenshot_name}'
77 | assert response.length == os.path.getsize(file_path)
78 | remove_file(file_path)
79 |
80 | def set_subtask_status(self, subtask_id, status, finished_at):
81 | """标记子任务爬取状态"""
82 | self.stub.SetSubTaskStatus(
83 | result_pb2.SetSubTaskStatusRequest(
84 | subtask_id=subtask_id,
85 | status=status,
86 | finished_at=finished_at
87 | ),
88 | timeout=30
89 | )
90 |
91 | def upload_har_file(self, har_file_name):
92 | """上传har文件"""
93 | chunks_generator = get_file_chunks(har_file_name, folder_path='hars')
94 | response = self.stub.StreamUploadHarFile(chunks_generator)
95 | file_path = f'/usr/src/app/hars/{har_file_name}'
96 | assert response.length == os.path.getsize(file_path)
97 | remove_file(file_path)
98 |
99 | @staticmethod
100 | def dic2json(dic):
101 | """某些字段转换为json"""
102 | return json.dumps(dic, ensure_ascii=False)
103 |
--------------------------------------------------------------------------------
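For orientation, a minimal, hypothetical usage sketch of the ResultClient defined above; the gRPC address and argument values are illustrative assumptions, not taken from the repository:

# Hypothetical example only: the address and values below are assumptions.
from rpc.client.result import ResultClient

client = ResultClient(rpc_server='engine:50051')  # engine-side gRPC endpoint (assumed)

# Mark a subtask as finished on the engine side.
client.set_subtask_status(subtask_id=1, status=True, finished_at='2021-01-01 00:00:00')

# Note: save_base_result() additionally expects 'response_headers', 'redirect_chain' and
# 'content' among its keyword arguments, and upload_screenshot()/upload_har_file() expect
# the file to exist under /usr/src/app/screenshots/ and /usr/src/app/hars/ respectively.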
/services/spider/rpc/pb/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/spider/webs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from flask import Flask
6 |
7 | from webs.api.utils.requests import before_request_middleware, \
8 | after_request_middleware, teardown_appcontext_middleware
9 | from webs.api.utils.responses import JSONResponse, app_error_handler
10 | from webs.api.utils.routers import register_routes as init_routes
11 | from webs.api.utils.settings import init_db
12 |
13 |
14 | def create_app():
15 | # instantiate the app
16 | app = Flask(__name__)
17 |
18 | # set config
19 | app_settings = os.getenv('APP_SETTINGS')
20 | app.config.from_object(app_settings)
21 |
22 | # register all blueprints
23 | init_routes(app=app)
24 |
25 | # register custom response class
26 | app.response_class = JSONResponse
27 |
28 | # register custom error handler
29 | app_error_handler(app=app)
30 |
31 | # register before request middleware
32 | before_request_middleware(app=app)
33 |
34 | # register after request middleware
35 | after_request_middleware(app=app)
36 |
37 | # register after app context teardown middleware
38 | teardown_appcontext_middleware(app=app)
39 |
40 | # set up extensions
41 | app_db = init_db(app=app)
42 |
43 | # shell context for flask cli
44 | @app.shell_context_processor
45 | def ctx():
46 | return {'app': app, 'db': app_db}
47 |
48 | return app
49 |
--------------------------------------------------------------------------------
/services/spider/webs/api/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/bizs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/bizs/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models.db_proxy import crawl_task_model_proxy
4 |
5 |
6 | class CrawlTaskBiz(object):
7 |
8 | def __init__(self):
9 | pass
10 |
11 | def create_crawl_task(self, subtask_id, url_nested_list, options={}):
12 | """
13 | 调度爬虫
14 | :param subtask_id:
15 | :param url_nested_list:
16 | :param options:
17 | :return:
18 | """
19 |
20 | # Create the CrawlTask record
21 | crawl_task_obj = crawl_task_model_proxy.create(
22 | subtask_id=subtask_id, url_nested_list=url_nested_list,
23 | process_state='readying', options=options)
24 |
25 | # Crawl asynchronously via Celery
26 | from worker import celery_app
27 | celery_app.send_task(
28 | name='fetch_tasks', queue='priority_fetch', priority=options.get('priority', 3),
29 | kwargs={'crawl_task_id': crawl_task_obj.id})
30 |
--------------------------------------------------------------------------------
/services/spider/webs/api/exceptions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/exceptions/customs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from werkzeug.exceptions import BadRequest, \
5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict
6 |
7 |
8 | class CustomException(Exception):
9 | """Custom JSON based exception."""
10 |
11 | status_code = BadRequest.code
12 | message = ""
13 |
14 | def __init__(self, message=None, status_code=None):
15 | """
16 | :param status_code: response status_code
17 | :param message: exception message
18 | """
19 |
20 | Exception.__init__(self)
21 |
22 | if message is not None:
23 | self.message = message
24 | if status_code is not None:
25 | self.status_code = status_code
26 |
27 | def to_dict(self):
28 | return {
29 | "status": False,
30 | "error": {
31 | "message": self.message,
32 | "type": str(self.__class__.__name__)
33 | }
34 | }
35 |
36 |
37 | class InvalidContentType(CustomException):
38 | """
39 | Raised when an invalid Content-Type is provided.
40 | """
41 |
42 | status_code = BadRequest.code
43 |
44 |
45 | class UnauthorizedAPIRequest(CustomException):
46 | """
47 | Raise if the user is not authorized. Also used if you want to use HTTP
48 | basic auth.
49 | """
50 |
51 | status_code = Unauthorized.code
52 |
53 |
54 | class InvalidPermissions(CustomException):
55 | """
56 | Raise if the user doesn't have the permission for the requested resource
57 | but was authenticated.
58 | """
59 |
60 | status_code = Forbidden.code
61 |
62 |
63 | class InvalidAPIRequest(CustomException):
64 | """
65 | Raised when an invalid request has been made.
66 | (e.g. a nonexistent URL was accessed, or the schema validation did
67 | not pass)
68 | """
69 |
70 | status_code = BadRequest.code
71 |
72 |
73 | class ServerError(CustomException):
74 | """
75 | Generic internal error.
76 | Inherit this error for all subsequent
77 | errors that are related to the server.
78 | """
79 |
80 | status_code = InternalServerError.code
81 |
82 |
83 | class DatabaseError(CustomException):
84 | """
85 | Generic database interaction error.
86 | Inherit this error for all subsequent
87 | errors that are related to database.
88 | """
89 |
90 | status_code = InternalServerError.code
91 |
92 |
93 | class RecordNotFound(DatabaseError):
94 | """
95 | Raised when the record was not found in the database.
96 | """
97 |
98 | status_code = NotFound.code
99 |
100 |
101 | class RecordAlreadyExists(DatabaseError):
102 | """
103 | Raised in the case of violation of a unique constraint.
104 | """
105 |
106 | status_code = Conflict.code
107 |
108 |
109 | class PublishError(CustomException):
110 | """
111 | Raised in the case of a publish error.
112 | """
113 |
114 | status_code = InternalServerError.code
115 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .base_model import db, redis_store
4 | from .crawl_task import CrawlTask
5 | from .result import Result
6 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/base_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 | from flask_redis import FlaskRedis
5 |
6 | db = SQLAlchemy()
7 | redis_store = FlaskRedis()
8 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | 底层爬虫子任务与Url映射关系
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY
9 | from sqlalchemy.dialects.postgresql import JSONB
10 |
11 | from webs.api.models import db
12 |
13 |
14 | class CrawlTask(db.Model):
15 | __tablename__ = 'crawl_tasks'
16 |
17 | id = Column(BigInteger, primary_key=True, autoincrement=True)
18 | subtask_id = Column(Integer, nullable=False, index=True) # ID of the owning subtask
19 | url_nested_list = Column(JSONB) # [{"url_id": xxx, "url_address": xxx, 'url_options': {}}]
20 | process_state = Column(String(30), server_default='readying') # readying / running / finished
21 | failure_url_ids = Column(ARRAY(Integer), server_default='{}') # URLs that failed to crawl
22 | finished_at = Column(TIMESTAMP) # finish time
23 | options = Column(JSONB) # crawl options
24 |
25 | # success_count = Column(Integer) # number of successfully crawled URLs
26 | # failure_count = Column(Integer) # number of failed URLs
27 |
28 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
29 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
30 |
31 | def __repr__(self):
32 | return f'<CrawlTask {self.id}>'  # assumed repr format; the original f-string body was lost
33 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .crawl_task import CrawlTaskModelProxy
4 | from .result import ResultModelProxy
5 |
6 | crawl_task_model_proxy = CrawlTaskModelProxy()
7 | result_model_proxy = ResultModelProxy()
8 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import CrawlTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class CrawlTaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = CrawlTask
11 |
12 | def create(self, **kwargs):
13 | """
14 | Create a CrawlTask object
15 | """
16 |
17 | crawl_task_obj = CrawlTask(
18 | subtask_id=kwargs['subtask_id'], url_nested_list=kwargs['url_nested_list'],
19 | process_state=kwargs['process_state'], options=kwargs['options'])
20 | self.db_session.add(crawl_task_obj)
21 | self.safe_commit()
22 | return crawl_task_obj
23 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Result
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ResultModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Result
11 |
12 | def create(self, subtask_id, url_id, url_address, **kwargs):
13 | """
14 | Save a crawl result.
15 | :param subtask_id:
16 | :param url_id:
17 | :param url_address:
18 | :param kwargs:
19 | :return:
20 | """
21 |
22 | result_obj = Result(
23 | subtask_id=subtask_id, url_id=url_id, url_address=url_address,
24 | http_code=kwargs.get('http_code'), title=kwargs.get('title'),
25 | content=kwargs.get('content'), current_url=kwargs.get('current_url'),
26 | har_uuid=kwargs.get('har_uuid'), screenshot_id=kwargs.get('screenshot_id'),
27 | response_headers=kwargs.get('response_headers', {}), redirect_chain=kwargs.get('redirect_chain', []),
28 | cookies=kwargs.get('cookies', [])
29 | )
30 | self.db_session.add(result_obj)
31 | self.safe_commit()
32 | return result_obj
33 |
34 | def query_already_crawl_url_ids(self, subtask_id):
35 | """
36 | Query the URL IDs that have already been crawled.
37 | :param subtask_id:
38 | :return:
39 | """
40 |
41 | query = self.db_session.query(self.model.url_id).filter(self.model.subtask_id == subtask_id).all()
42 | return [each[0] for each in query]
43 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Low-level result storage, kept as a backup
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text
9 | from sqlalchemy.dialects.postgresql import JSONB
10 |
11 | from webs.api.models import db
12 |
13 |
14 | class Result(db.Model):
15 | __tablename__ = 'results'
16 |
17 | id = Column(BigInteger, primary_key=True, autoincrement=True)
18 | subtask_id = Column(Integer, nullable=False, index=True) # ID of the owning subtask
19 | url_id = Column(Integer, nullable=False, index=True) # url id
20 | url_address = Column(String(1024), nullable=False) # URL address
21 | http_code = Column(Integer) # site HTTP status code
22 | title = Column(Text) # site title
23 | content = Column(Text) # site content
24 | current_url = Column(String(1024)) # last URL the site responded with
25 | redirect_chain = Column(JSONB) # redirect chain
26 | response_headers = Column(JSONB) # response headers
27 | har_uuid = Column(String(128)) # file storing the site's HTTP interaction (HAR)
28 | screenshot_id = Column(String(128)) # screenshot ID
29 | cookies = Column(JSONB) # cookies
30 |
31 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
32 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
33 |
34 | def __repr__(self):
35 | return f'<Result {self.id}>'  # assumed repr format; the original f-string body was lost
36 |
--------------------------------------------------------------------------------
/services/spider/webs/api/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from webs.api.exceptions.customs import InvalidAPIRequest
5 |
6 |
7 | class LengthChecker(object):
8 | """字段长度校验"""
9 |
10 | def __init__(self, sign, length):
11 | self.sign = sign
12 | self.length = length
13 |
14 | def __call__(self, verified):
15 | if verified is not None and len(verified) > self.length:
16 | raise InvalidAPIRequest(f'{self.sign} is too long!')
17 |
18 |
19 | class OneOf(object):
20 | """Validator which succeeds if ``value`` is a member of ``choices``"""
21 |
22 | def __init__(self, choices):
23 | self.choices = choices
24 |
25 | def __call__(self, verified):
26 | if verified not in self.choices:
27 | raise InvalidAPIRequest(f'Please choose one of {self.choices}!')
28 |
--------------------------------------------------------------------------------
/services/spider/webs/api/schemas/crawl_tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webargs import fields
4 |
5 | from webs.api.schemas import OneOf
6 |
7 | create_crawl_task_schema = {
8 | 'subtask_id': fields.Int(required=True),
9 | 'url_nested_list': fields.DelimitedList(fields.Nested({
10 | 'url_id': fields.Int(required=True),
11 | 'url_address': fields.Str(required=True),
12 | 'url_options': fields.Dict(missing={})
13 | }), required=True),
14 | 'options': fields.Nested({
15 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])),
16 | 'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])), # task priority
17 | 'headless': fields.Bool(missing=False), # headed/headless mode; headed by default
18 | 'debug': fields.Bool(missing=False), # whether to enable debug mode
19 | 'referer': fields.Str(), # referer for the target site
20 | 'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])), # concurrency
21 | 'url_timeout': fields.Int(missing=30), # timeout for a single URL
22 | 'enabled_render_js': fields.Bool(missing=True),
23 | 'page_wait_time': fields.Int(missing=3), # time to wait for page JS rendering
24 | 'ignore_ssl': fields.Bool(missing=True), # whether to ignore certificate errors
25 | 'screenshot': fields.Bool(missing=False), # whether to take a screenshot
26 | 'proxy_url': fields.Str(), # proxy
27 | 'user_agent': fields.Str(), # User-Agent
28 | 'record_har': fields.Bool(missing=False), # whether to record request networks (HAR)
29 | 'record_redirect': fields.Bool(missing=False), # whether to record the redirect chain
30 | 'use_browser_cache': fields.Bool(missing=True), # whether to use the browser cache
31 | 'use_result_cache': fields.Bool(missing=True), # whether to use the result cache
32 | 'wappalyzer': fields.Bool(missing=False), # whether to run fingerprint identification
33 | 'wait_until': fields.Str(
34 | missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])), # controls when the page counts as loaded
35 | 'rpc_server': fields.Str(required=True)
36 | }, missing={})
37 | }
38 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/loggers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import socket
6 | import sys
7 | import traceback
8 | from datetime import datetime
9 |
10 | try:
11 | import simplejson as json
12 | except ImportError:
13 | import json
14 |
15 |
16 | class JSONFormatter(logging.Formatter):
17 | """
18 | JSON formatter for python logging
19 |
20 | You can pass additional tags on a per message basis using the
21 | key "tags" in the extra parameter.
22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]})
23 | """
24 |
25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON',
26 | indent=None):
27 | """
28 | :param tags: a list of tags to add to every message
29 | :param hostname: force a specific hostname
30 | :param fqdn: a boolean to use the FQDN instead of the machine's hostname
31 | :param message_type: the message type for Logstash formatters
32 | :param indent: indent level of the JSON output
33 | """
34 | self.message_type = message_type
35 | self.tags = tags if tags is not None else []
36 | self.extra_tags = []
37 | self.indent = indent
38 |
39 | if hostname:
40 | self.host = hostname
41 | elif fqdn:
42 | self.host = socket.getfqdn()
43 | else:
44 | self.host = socket.gethostname()
45 |
46 | def get_extra_fields(self, record):
47 | # The list contains all the attributes listed in
48 | # http://docs.python.org/library/logging.html#logrecord-attributes
49 | skip_list = [
50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args',
51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg',
52 | 'msecs', 'message', 'name', 'pathname', 'process',
53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra']
54 |
55 | if sys.version_info < (3, 0):
56 | easy_types = (str, bool, dict, float, int, list, type(None))
57 | else:
58 | easy_types = (str, bool, dict, float, int, list, type(None))
59 |
60 | fields = {}
61 |
62 | self.extra_tags = []
63 | for key, value in record.__dict__.items():
64 | if key not in skip_list:
65 | if key == 'tags' and isinstance(value, list):
66 | self.extra_tags = value
67 | elif isinstance(value, easy_types):
68 | fields[key] = value if value else "null"
69 | else:
70 | fields[key] = repr(value)
71 |
72 | return fields
73 |
74 | def get_debug_fields(self, record):
75 | if record.exc_info:
76 | exc_info = self.format_exception(record.exc_info)
77 | else:
78 | exc_info = record.exc_text
79 | return {
80 | 'exc_info': exc_info,
81 | 'filename': record.filename,
82 | 'lineno': record.lineno,
83 | }
84 |
85 | @classmethod
86 | def format_source(cls, message_type, host, path):
87 | return "%s://%s/%s" % (message_type, host, path)
88 |
89 | @classmethod
90 | def format_timestamp(cls, time):
91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X"))
92 |
93 | @classmethod
94 | def format_exception(cls, exc_info):
95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else ''
96 |
97 | @classmethod
98 | def serialize(cls, message, indent=None):
99 | return json.dumps(message, ensure_ascii=False, indent=indent)
100 |
101 | def format(self, record, serialize=True):
102 | old_message = record.getMessage()
103 | try:
104 | new_message = json.loads(old_message)
105 | except json.decoder.JSONDecodeError:
106 | try: # single-quoted dict-like messages: retry after swapping quotes
107 | new_message = json.loads(old_message.replace("'", '"'))
108 | except Exception: # still not valid JSON: fall back to the raw message text
109 | new_message = old_message
110 | # Create message dict
111 | message = {
112 | 'timestamp': self.format_timestamp(record.created),
113 | 'app': os.environ.get('APP_NAME'),
114 | 'host': self.host,
115 | 'environment': os.environ.get('FLASK_ENV'),
116 | 'logger': record.name,
117 | 'level': record.levelname,
118 | 'messages': new_message,
119 | 'path': record.pathname,
120 | 'tags': self.tags[:]
121 | }
122 |
123 | # Add extra fields
124 | message.update(self.get_extra_fields(record))
125 |
126 | # Add extra tags
127 | if self.extra_tags:
128 | message['tags'].extend(self.extra_tags)
129 |
130 | # If exception, add debug info
131 | if record.exc_info or record.exc_text:
132 | message.update(self.get_debug_fields(record))
133 |
134 | if serialize:
135 | return self.serialize(message, indent=self.indent)
136 | return message
137 |
--------------------------------------------------------------------------------
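A short sketch of how the JSONFormatter above can be attached to a standard-library logger; the logger name, tag and level are arbitrary examples:

import logging

from webs.api.utils.loggers import JSONFormatter

handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter(tags=['spider']))  # tag list is illustrative

logger = logging.getLogger('spider')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Extra tags can be attached per message, as the class docstring notes.
logger.error('hello world!', extra={"tags": ["hello=world"]})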
/services/spider/webs/api/utils/requests.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 | from flask import current_app, request
5 | from sqlalchemy.exc import DatabaseError
6 | from webs.api.exceptions.customs import InvalidContentType
7 | from webs.api.models import db
8 |
9 | ACL_ORIGIN = 'Access-Control-Allow-Origin'
10 | ACL_METHODS = 'Access-Control-Allow-Methods'
11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers'
12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials'
13 | ACL_CACHE_CONTROL = 'Cache-Control'
14 |
15 | GET_METHOD = 'GET'
16 | OPTIONS_METHOD = 'OPTIONS'
17 | ALLOWED_ORIGINS = '*'
18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \
20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type'
21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie
22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate'
23 |
24 |
25 | def before_request_middleware(app):
26 | app.before_request_funcs.setdefault(None, [
27 | ensure_request_log,
28 | ensure_content_type,
29 | ])
30 |
31 |
32 | def after_request_middleware(app):
33 | app.after_request_funcs.setdefault(None, [
34 | enable_cors,
35 | commit_session,
36 | ])
37 |
38 |
39 | def teardown_appcontext_middleware(app):
40 | app.teardown_appcontext_funcs = [
41 | shutdown_session,
42 | ]
43 |
44 |
45 | def ensure_request_log():
46 | """当为生产环境时,屏蔽中间件日志记录器"""
47 | if current_app.debug:
48 | current_app.logger.info(
49 | "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || "
50 | "Parameters: {param}".format(
51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr),
53 | path=request.full_path,
54 | param=request.data.decode('utf-8')))
55 |
56 |
57 | def ensure_content_type():
58 | """
59 | Ensures that the Content-Type for all requests
60 | is `application/json` or `multipart/form-data`, otherwise an appropriate error
61 | is raised.
62 | :raises: InvalidContentType if Content-Type is neither `application/json`
63 | nor `multipart/form-data`
64 | """
65 |
66 | content_type = request.headers.get('Content-Type')
67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \
68 | (not content_type or not ('application/json' in content_type or
69 | 'multipart/form-data' in content_type)):
70 | raise InvalidContentType(
71 | message='Invalid Content-Type. '
72 | 'Only `application/json` or `multipart/form-data` is allowed')
73 |
74 |
75 | def enable_cors(response):
76 | """
77 | Enable Cross-origin resource sharing.
78 | These headers are needed for the clients that
79 | will consume the API via AJAX requests.
80 | """
81 | if request.method == OPTIONS_METHOD:
82 | response = current_app.make_default_options_response()
83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS
84 | response.headers[ACL_METHODS] = ALLOWED_METHODS
85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS
86 | response.headers[ACL_CACHE_CONTROL] = ALLOWED_CACHE_CONTROL
87 |
88 | return response
89 |
90 |
91 | def commit_session(response):
92 | """
93 | Try to commit the db session in the case
94 | of a successful request with status_code
95 | under 400.
96 | """
97 | if response.status_code >= 400:
98 | return response
99 | try:
100 | db.session.commit()
101 | except DatabaseError:
102 | db.session.rollback()
103 | return response
104 |
105 |
106 | def shutdown_session(exception=None):
107 | """
108 | Remove the db session and detach from the
109 | database driver after application shutdown.
110 | """
111 | db.session.remove()
112 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/routers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pkgutil
4 |
5 |
6 | def register_routes(app):
7 | """Register routes."""
8 | from .. import views
9 | from flask.blueprints import Blueprint
10 |
11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."):
12 | blueprint_name = name.split('.')[-1]
13 | modules = __import__(name, fromlist="dummy")
14 | blueprint = getattr(modules, blueprint_name)
15 | if isinstance(blueprint, Blueprint):
16 | app.register_blueprint(blueprint)
17 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_migrate import Migrate
4 |
5 | from webs.api.models import db, redis_store
6 |
7 |
8 | def init_db(app):
9 | """
10 | Create the database if it doesn't exist and
11 | create all tables.
12 | """
13 |
14 | # Initialize PostgreSQL
15 | db.init_app(app)
16 | migrate = Migrate(compare_type=True, compare_server_default=True)
17 | migrate.init_app(app, db)
18 |
19 | # Initialize Redis
20 | redis_store.init_app(app)
21 |
22 | return db
23 |
--------------------------------------------------------------------------------
/services/spider/webs/api/views/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/views/crawl_tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.crawl_task import CrawlTaskBiz
7 | from webs.api.schemas.crawl_tasks import create_crawl_task_schema
8 |
9 | crawl_tasks = Blueprint('crawl_tasks', __name__, url_prefix='/crawl_tasks')
10 |
11 |
12 | @crawl_tasks.route('', methods=['POST'])
13 | @use_args(create_crawl_task_schema, locations=('json',))
14 | def create_crawl_task(args):
15 | """
16 | 创建爬虫任务
17 | :param args:
18 | :return:
19 | """
20 | crawl_task_biz = CrawlTaskBiz()
21 | data = crawl_task_biz.create_crawl_task(**args)
22 |
23 | return jsonify({
24 | 'status': True,
25 | 'data': data
26 | }), 201
27 |
--------------------------------------------------------------------------------
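For reference, a hypothetical call to the endpoint above; the host, port and payload values are assumptions (the port follows the mapping in spider.docker-conpose.yml, and the payload shape follows create_crawl_task_schema):

import requests

payload = {
    'subtask_id': 1,
    'url_nested_list': [
        {'url_id': 10, 'url_address': 'http://example.com', 'url_options': {}},
    ],
    'options': {
        'browser_type': 'firefox',
        'screenshot': True,
        'rpc_server': 'engine:50051',  # engine-side gRPC endpoint (assumed)
    },
}

# The request middleware requires an application/json (or multipart/form-data) Content-Type.
resp = requests.post('http://localhost:15001/crawl_tasks', json=payload)
print(resp.status_code, resp.json())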
/services/spider/webs/api/views/ping.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 |
5 | ping = Blueprint('ping', __name__)
6 |
7 |
8 | @ping.route('/ping', methods=['GET'])
9 | def ping_pong():
10 | """
11 | Check whether the service is available.
12 | """
13 | return jsonify({
14 | "data": "pong",
15 | "status": True
16 | })
17 |
--------------------------------------------------------------------------------
/services/spider/webs/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 |
6 | class BaseConfig:
7 | """Base configuration"""
8 |
9 | # Root path of project
10 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
11 |
12 | DEBUG = True
13 | TESTING = False
14 | SQLALCHEMY_TRACK_MODIFICATIONS = False
15 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True}
16 | SECRET_KEY = os.environ.get('SECRET_KEY')
17 |
18 | # Redis configuration
19 | REDIS_URL = os.environ.get('REDIS_URL')
20 |
21 |
22 | class DevelopmentConfig(BaseConfig):
23 | """Development configuration"""
24 |
25 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
26 |
27 |
28 | class ProductionConfig(BaseConfig):
29 | """Production configuration"""
30 |
31 | DEBUG = False
32 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
33 |
--------------------------------------------------------------------------------
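The configuration above is driven entirely by environment variables; a hypothetical spider.env (referenced by the compose file later in this listing) might look like the following, with placeholder hostnames and credentials:

APP_SETTINGS=webs.config.DevelopmentConfig
FLASK_ENV=development
APP_NAME=spider
SECRET_KEY=change_me
DATABASE_URL=postgresql://user:password@db:5432/spider
REDIS_URL=redis://redis:6379/0
CRAWL_CELERY_BROKER_URL=amqp://guest:guest@rabbitmq:5672//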
/services/spider/worker/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from celery import Celery
5 |
6 | ##################
7 | # Celery configuration
8 | from kombu import Queue
9 |
10 | from webs import create_app
11 |
12 |
13 | class CeleryConfig(object):
14 | # Serialize tasks as JSON; from Celery 4.0 the default serializer is JSON
15 | task_serializer = 'json'
16 |
17 | # Serialize results as JSON
18 | result_serializer = 'json'
19 |
20 | # Expiry time for task results
21 | result_expires = 60 * 60 * 24
22 |
23 | # Disable worker task events to keep the event queue from overflowing
24 | worker_send_task_events = False
25 |
26 | # Accepted task content types
27 | accept_content = ["json"]
28 |
29 | # Tasks prefetched per process; overridden by the launch arguments, kept here only as a marker
30 | worker_prefetch_multiplier = 4
31 |
32 | # Each worker is recycled after one task; overridden by the launch arguments, kept here only as a marker
33 | worker_max_tasks_per_child = 1
34 |
35 | # Timezone settings
36 | timezone = 'Asia/Shanghai'
37 | enable_utc = True
38 |
39 |
40 | ##################
41 | # Initialize the Celery worker
42 | def init_celery(app=None, celery_type='usual'):
43 | app = app or create_app()
44 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL'))
45 | celery_app.config_from_object(CeleryConfig)
46 |
47 | # Import the relevant task modules
48 | if celery_type == 'usual':
49 | celery_app.conf.update(imports=['worker.fetch', 'worker.results'])
50 | celery_app.conf.task_queues = (
51 | Queue("priority_fetch", queue_arguments={'x-max-priority': 5}),
52 | Queue("results"),
53 | )
54 | elif celery_type == 'beat':
55 | pass
56 | # celery_app.conf.update(
57 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron'])
58 | # celery_app.conf.update(
59 | # CELERYBEAT_SCHEDULE={
60 | # }
61 | # )
62 |
63 | # Execute tasks inside the Flask app context
64 | class ContextTask(celery_app.Task):
65 | """Make celery tasks work with Flask app context"""
66 |
67 | def __call__(self, *args, **kwargs):
68 | with app.app_context():
69 | return self.run(*args, **kwargs)
70 |
71 | celery_app.Task = ContextTask
72 | return celery_app
73 |
74 |
75 | celery_app = init_celery()
76 | # beat_app = init_celery(celery_type='beat')
77 |
--------------------------------------------------------------------------------
/services/spider/worker/fetch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 |
5 | from billiard.exceptions import SoftTimeLimitExceeded
6 |
7 | from rpc.client.result import ResultClient
8 | from webs.api.models.db_proxy import crawl_task_model_proxy, result_model_proxy
9 | from worker import celery_app
10 | from worker.library.playwright import PlayWrightHandler
11 |
12 |
13 | @celery_app.task(
14 | name='fetch_tasks', queue='priority_fetch', acks_late=True, soft_time_limit=1000, max_retries=1,
15 | default_retry_delay=30, autoretry_for=(Exception,))
16 | def fetch_tasks(crawl_task_id):
17 | """
18 | Pull a task from the priority queue and crawl it.
19 | """
20 |
21 | crawl_task_obj = crawl_task_model_proxy.find_one_with_condition(
22 | crawl_task_model_proxy.model.id == crawl_task_id,
23 | crawl_task_model_proxy.model.process_state != 'finished'
24 | )
25 | if not crawl_task_obj:
26 | return
27 |
28 | # Mark the crawl task as started
29 | if crawl_task_obj.process_state == 'readying':
30 | crawl_task_model_proxy.set_attr(crawl_task_obj, 'process_state', 'running')
31 | url_nested_list = crawl_task_obj.url_nested_list
32 |
33 | # This branch is hit when a worker process exited abnormally: RabbitMQ never acked the message, so the task is redelivered after the worker restarts
34 | else: # crawl_task_obj.process_state == 'running'
35 | already_url_ids = result_model_proxy.query_already_crawl_url_ids(subtask_id=crawl_task_obj.subtask_id)
36 | url_nested_list = [
37 | url_info for url_info in crawl_task_obj.url_nested_list
38 | if url_info['url_id'] not in already_url_ids
39 | ]
40 | undone_url_ids = []
41 | if url_nested_list:
42 | # Run the crawl
43 | playwright_handler = PlayWrightHandler(
44 | subtask_id=crawl_task_obj.subtask_id,
45 | url_nested_list=url_nested_list,
46 | options=crawl_task_obj.options)
47 | undone_url_ids = playwright_handler.run()
48 |
49 | # Record the crawl state, finish time and the URLs that failed
50 | crawl_task_model_proxy.set_many_attr(
51 | obj=crawl_task_obj,
52 | fields_v={
53 | 'process_state': 'finished',
54 | 'finished_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
55 | 'failure_url_ids': undone_url_ids
56 | }
57 | )
58 |
59 | ####### Call the engine-side RPC service to set the subtask crawl status
60 | # Connect to the gRPC service
61 | grpc_result_client = ResultClient(crawl_task_obj.options.get('rpc_server'))
62 |
63 | # Set the subtask crawl status
64 | grpc_result_client.set_subtask_status(
65 | subtask_id=crawl_task_obj.subtask_id, status=True, finished_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
66 |
--------------------------------------------------------------------------------
/services/spider/worker/library/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/worker/library/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | from typing import Optional
5 |
6 | if sys.version_info >= (3, 8):
7 | from typing import TypedDict # pylint: disable=no-name-in-module
8 | else:
9 | from typing_extensions import TypedDict
10 |
11 |
12 | class ProxyServer(TypedDict):
13 | server: str
14 | bypass: Optional[str]
15 | username: Optional[str]
16 | password: Optional[str]
17 |
18 |
19 | class RecordHarOptions(TypedDict):
20 | omitContent: Optional[bool]
21 | path: str
22 |
--------------------------------------------------------------------------------
/services/spider/worker/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import hashlib
3 | import json
4 | import os
5 | import time
6 |
7 | from rpc.client.result import ResultClient
8 | from webs.api.models.db_proxy import result_model_proxy
9 | from worker import celery_app
10 |
11 |
12 | @celery_app.task(name='save_results', queue='results')
13 | def save_results(subtask_id, url_id, url_address, rpc_server, **kwargs):
14 | """
15 | Save crawl results.
16 | :param subtask_id:
17 | :param url_id:
18 | :param url_address:
19 | :param rpc_server:
20 | :param kwargs:
21 | :return:
22 | """
23 |
24 | http_archive_dict = kwargs.pop('http_archive_dict')
25 |
26 | # Persist the crawl result locally, used only as a disaster-recovery backup
27 | result_model_proxy.create(subtask_id, url_id, url_address)
28 |
29 | # Connect to the gRPC service
30 | grpc_result_client = ResultClient(rpc_server)
31 |
32 | # Report the screenshot back to the engine
33 | if kwargs.get('screenshot_id') \
34 | and os.path.exists('/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id'])):
35 | img_path = '/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id'])
36 | try:
37 | with open(img_path, 'rb') as f:
38 | md5 = hashlib.md5()
39 | while True:
40 | fb = f.read(8096)
41 | if not fb:
42 | break
43 | md5.update(fb)
44 | screenshot_md5 = md5.hexdigest()
45 | os.rename(img_path, f'/usr/src/app/screenshots/{screenshot_md5}.png')
46 | kwargs['screenshot_id'] = screenshot_md5
47 | grpc_result_client.upload_screenshot(screenshot_name=f'{screenshot_md5}.png')
48 | except Exception as e:
49 | pass
50 |
51 | # Report the basic crawl data back to the engine
52 | grpc_result_client.save_base_result(subtask_id, url_id, url_address, **kwargs)
53 |
54 | # Report the HAR file back to the engine
55 | if kwargs.get('har_uuid') and http_archive_dict.get('hars'):
56 | with open('/usr/src/app/hars/{}.json'.format(kwargs['har_uuid']), 'w+', encoding='utf-8') as f:
57 | f.write(json.dumps(http_archive_dict, ensure_ascii=False, indent=2))
58 | grpc_result_client.upload_har_file(har_file_name='{}.json'.format(kwargs['har_uuid']))
59 |
--------------------------------------------------------------------------------
/spider.docker-conpose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | spider-client:
5 | container_name: spider-client
6 | build:
7 | context: ./services/spider
8 | dockerfile: Dockerfile-dev
9 | volumes:
10 | - './services/spider:/usr/src/app'
11 | ports:
12 | - 15001:5000
13 | env_file:
14 | - spider.env
15 | environment:
16 | - FLASK_APP=webs:create_app
17 | - ENDPOINT=web
18 | restart: always
19 |
20 | spider-fetch:
21 | container_name: spider-fetch
22 | build:
23 | context: ./services/spider
24 | dockerfile: Dockerfile-dev
25 | volumes:
26 | - './services/spider:/usr/src/app'
27 | env_file:
28 | - spider.env
29 | environment:
30 | - ENDPOINT=fetch
31 | restart: always
32 |
33 |
34 | save-results:
35 | container_name: save-results
36 | build:
37 | context: ./services/spider
38 | dockerfile: Dockerfile-dev
39 | volumes:
40 | - './services/spider:/usr/src/app'
41 | env_file:
42 | - spider.env
43 | environment:
44 | - ENDPOINT=results
45 | restart: always
--------------------------------------------------------------------------------
/架构图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/架构图.png
--------------------------------------------------------------------------------