├── .gitignore ├── .idea ├── .gitignore ├── crawloop.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── LICENSE ├── README.md ├── engine.docker-compose.yml ├── image_build ├── msyh.ttf ├── playwright-xvfb └── sources.list ├── services ├── engine │ ├── .dockerignore │ ├── Dockerfile-dev │ ├── aps │ │ ├── __init__.py │ │ ├── func.py │ │ ├── logger.py │ │ └── server.py │ ├── apserver.py │ ├── build.sh │ ├── build_sentry_ini.py │ ├── compile.py │ ├── entrypoint.sh │ ├── grpcserver.py │ ├── gunicorn_config.py │ ├── gunicorn_logging.ini │ ├── manage.py │ ├── migrations │ │ ├── README │ │ ├── alembic.ini │ │ ├── env.py │ │ ├── script.py.mako │ │ └── versions │ │ │ ├── 1569921cac58_加入响应时间和渲染时间.py │ │ │ ├── 4a243739ef84_初始化.py │ │ │ ├── 5b189e0161ee_加入网站编码.py │ │ │ ├── 71bf761944f8_新增网站图标字段.py │ │ │ └── b3bd5bc9e4e3_增加extra客户端额外数据.py │ ├── requirements.txt │ ├── rpc │ │ ├── __init__.py │ │ ├── client │ │ │ ├── __init__.py │ │ │ └── callback_client.py │ │ ├── codegen.sh │ │ ├── pb │ │ │ ├── __init__.py │ │ │ ├── callback_pb2.py │ │ │ ├── callback_pb2_grpc.py │ │ │ ├── result_pb2.py │ │ │ └── result_pb2_grpc.py │ │ ├── protos │ │ │ └── result.proto │ │ └── server │ │ │ ├── __init__.py │ │ │ └── result.py │ ├── sources.list │ ├── wappalyzer │ │ ├── __init__.py │ │ ├── data.json │ │ ├── helper.py │ │ ├── modelcalss.py │ │ └── wappalyzerhandler.py │ ├── webs │ │ ├── __init__.py │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── bizs │ │ │ │ ├── __init__.py │ │ │ │ ├── result.py │ │ │ │ └── task.py │ │ │ ├── exceptions │ │ │ │ ├── __init__.py │ │ │ │ └── customs.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── apscheduler_job.py │ │ │ │ ├── base_model.py │ │ │ │ ├── db_proxy │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── apschedule.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── result.py │ │ │ │ │ ├── schedule_task.py │ │ │ │ │ ├── server.py │ │ │ │ │ ├── subtask.py │ │ │ │ │ ├── task.py │ │ │ │ │ ├── task_url.py │ │ │ │ │ └── url.py │ │ │ │ ├── result.py │ │ │ │ ├── server.py │ │ │ │ ├── task.py │ │ │ │ ├── task_url.py │ │ │ │ └── url.py │ │ │ ├── schemas │ │ │ │ ├── __init__.py │ │ │ │ ├── results.py │ │ │ │ └── tasks.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── helper.py │ │ │ │ ├── loggers.py │ │ │ │ ├── requests.py │ │ │ │ ├── responses.py │ │ │ │ ├── routers.py │ │ │ │ └── settings.py │ │ │ └── views │ │ │ │ ├── __init__.py │ │ │ │ ├── ping.py │ │ │ │ ├── results.py │ │ │ │ └── tasks.py │ │ ├── config.py │ │ └── core │ │ │ ├── __init__.py │ │ │ └── requests │ │ │ ├── __init__.py │ │ │ └── request.py │ └── worker │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── library │ │ ├── __init__.py │ │ ├── favicon.py │ │ └── helper.py │ │ └── result.py └── spider │ ├── .dockerignore │ ├── Dockerfile-dev │ ├── Dockerfile-prod │ ├── build.sh │ ├── build_sentry_ini.py │ ├── compile.py │ ├── entrypoint.sh │ ├── gunicorn_config.py │ ├── gunicorn_logging.ini │ ├── manage.py │ ├── migrations │ ├── README │ ├── alembic.ini │ ├── env.py │ ├── script.py.mako │ └── versions │ │ ├── 81a88acb3641_记录cookies.py │ │ └── 8efa2b9dcc87_init.py │ ├── requirements.txt │ ├── rpc │ ├── __init__.py │ ├── client │ │ ├── __init__.py │ │ └── result.py │ └── pb │ │ ├── __init__.py │ │ ├── result_pb2.py │ │ └── result_pb2_grpc.py │ ├── sources.list │ ├── webs │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── bizs │ │ │ ├── __init__.py │ │ │ └── crawl_task.py │ │ ├── exceptions │ │ │ ├── __init__.py │ │ │ └── customs.py │ │ ├── models │ │ │ ├── 
__init__.py │ │ │ ├── base_model.py │ │ │ ├── crawl_task.py │ │ │ ├── db_proxy │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── crawl_task.py │ │ │ │ └── result.py │ │ │ └── result.py │ │ ├── schemas │ │ │ ├── __init__.py │ │ │ └── crawl_tasks.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── loggers.py │ │ │ ├── requests.py │ │ │ ├── responses.py │ │ │ ├── routers.py │ │ │ └── settings.py │ │ └── views │ │ │ ├── __init__.py │ │ │ ├── crawl_tasks.py │ │ │ └── ping.py │ └── config.py │ └── worker │ ├── __init__.py │ ├── fetch.py │ ├── library │ ├── __init__.py │ ├── helper.py │ └── playwright.py │ └── results.py ├── spider.docker-conpose.yml └── 架构图.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/.idea/.gitignore -------------------------------------------------------------------------------- /.idea/crawloop.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 1625148082549 51 | 57 | 58 | 59 | 60 | 62 | 63 | 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 用go简单实现了一版playwright爬虫器,发现性能不是python版能比的,后面又用go实现了[网站目录文件爆破](https://github.com/who0sy/WebsiteFuzz)简单版本,跑起来速度简直太爽了。后面有时间会用继续用go重构此项目,python版的不再维护。 3 | ------------------------------------------------------------------------------------------------------------ 4 | ------------------------------------------------------------------------------------------------------------- 5 | 6 | 7 | 8 | # crawloop 9 | 基于PlayWright实现对js渲染的动态网页进行抓取,包含网页源码、截图、网站入口、网页交互过程等,支持优先级任务调度。 10 | 11 | crawloop 目前支持以下特性: 12 | - 原生浏览器环境,支持chrome、firefox,并支持开启容器内虚拟显示器; 13 | - 完整DOM事件收集,自动化触发; 14 | - 全面分析收集,包括js文件,页面源码、网站截图、网站图标、标题、编码、cookies、重定向链等等; 15 | - 基于Wappalyzer实现python版网站指纹识别,主要包含网站使用技术、技术版本号、置信度等; 16 | - 支持Host绑定,可自定义添加Referer; 17 | - 支持请求代理,支持爬虫结果通过http或gRPC主动回调; 18 | - 
任务进度监控:支持实时监控任务进度; 19 | - 自定义任务参数:支持在线配置调度任务入参,即时生效; 20 | - 调度中心HA(中心式):调度采用中心式设计,“调度中心”自研调度组件并支持集群部署,可保证调度中心HA; 21 | - 爬虫执行器HA(分布式):任务分布式执行,任务"爬虫执行器"支持集群部署,可保证爬虫任务执行HA; 22 | - 弹性扩容缩容:一旦有新的爬虫执行器机器上线或者下线,下次调度时将会重新分配任务; 23 | - 触发策略:提供丰富的任务触发策略,包括:Cron触发、固定间隔触发、固定延时触发、API(事件)触发、人工触发、父子任务触发; 24 | - 阻塞处理策略:调度过于密集爬虫执行器来不及处理时的处理策略,策略包括:单机串行(默认)、丢弃后续调度、覆盖之前调度; 25 | - 任务超时控制:支持自定义任务超时时间,任务运行超时将会主动中断任务; 26 | - 任务失败重试:支持自定义任务失败重试次数,当任务失败时将会按照预设的失败重试次数主动进行重试;其中分片任务支持分片粒度的失败重试;并支持断点续爬; 27 | - 路由策略:爬虫执行器集群部署时提供丰富的路由策略,包括:第一个(已实现)、最后一个(已实现)、轮询(已实现)、加权轮询(已实现)、一致性HASH(待实现)等; 28 | - 动态分片:分片广播任务以任务数量为维度进行分片(默认为100条url为一个分片),以分片为单位下发不同爬虫执行器,协同进行业务处理;在进行大数据量爬虫任务操作时可显著提升任务处理能力和速度; 29 | - 调度线程池:调度系统多线程触发调度运行,确保调度精确执行,不被堵塞; 30 | - 全异步:任务调度流程全异步化设计实现,如异步调度、异步运行、异步回调等,有效对密集调度进行流量削峰,理论上支持任意时长任务的运行; 31 | - 跨语言:调度中心与爬虫执行器提供语言无关的 RESTful API 服务,第三方任意语言可据此对接调度中心或者实现自定义爬虫执行器; 32 | - 任务优先级控制:爬虫执行器实现优先级队列,可对不同优先级任务进行隔离拆分,慢任务或权重较低任务自动降级进入"Slow"队列,避免耗尽爬虫执行器,提高系统灵活性; 33 | - 容器化:项目编译在容器内进行,进一步实现功能开箱即用; 34 | 35 | 36 | ### 环境(Docker) 37 | - Docker 18.03+ 38 | - Postgresl 9.x+ 39 | - Rabbitmq 3.8.x+ 40 | - Docker Compose 1.24+ 41 | 42 | 43 | ## 架构 44 | 45 | Crawloop的架构包括了一个主节点(Master Node)和多个工作节点(Worker Node),以及负责通信和数据储存的gRPC和Postgresql数据库。 46 | 47 | ![](架构图.png) 48 | 49 | 客户端应用向主节点请求数据,主节点通过Celery和Rabbitmq来执行任务派发调度以及负载均衡,工作节点收到任务之后,开始执行爬虫任务,并将任务结果通过gRPC回调给主节点,之后落库存储。 50 | 51 | 主节点是整个Crawloop架构的核心,属于Crawloop的中控系统。 52 | 53 | 主节点主要负责以下功能: 54 | 1. 周期性任务调度 55 | 2. 工作节点管理和通信 56 | 3. 对外API服务 57 | 58 | 主节点负责与客户端进行通信,并通过Celery将爬虫任务基于负载均衡算法异步派发给工作节点。 59 | 60 | ### 工作节点 61 | 62 | 工作节点的主要功能是执行爬虫任务和回调抓取数据与日志,并且通过gRPC跟主节点通信。通过增加工作节点数量,Crawloop可以做到横向扩展,不同的爬虫任务可以分配到不同的节点上执行。 63 | -------------------------------------------------------------------------------- /engine.docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | engine: 5 | container_name: engine 6 | build: 7 | context: ./services/engine 8 | dockerfile: Dockerfile-dev 9 | volumes: 10 | - './services/engine:/usr/src/app' 11 | ports: 12 | - 15000:5000 13 | env_file: 14 | - engine.env 15 | environment: 16 | - FLASK_APP=webs:create_app 17 | - ENDPOINT=web 18 | restart: always 19 | 20 | engine-worker: 21 | container_name: engine-worker 22 | build: 23 | context: ./services/engine 24 | dockerfile: Dockerfile-dev 25 | volumes: 26 | - './services/engine:/usr/src/app' 27 | env_file: 28 | - engine.env 29 | environment: 30 | - ENDPOINT=engine-worker 31 | restart: always 32 | 33 | engine-grpc: 34 | container_name: engine-grpc 35 | build: 36 | context: ./services/engine 37 | dockerfile: Dockerfile-dev 38 | volumes: 39 | - './services/engine:/usr/src/app' 40 | ports: 41 | - 15002:15002 42 | env_file: 43 | - engine.env 44 | environment: 45 | - ENDPOINT=engine-grpc 46 | restart: always 47 | 48 | engine-apscheduler: 49 | container_name: engine-apscheduler 50 | build: 51 | context: ./services/engine 52 | dockerfile: Dockerfile-dev 53 | volumes: 54 | - './services/engine:/usr/src/app' 55 | ports: 56 | - 15003:15003 57 | env_file: 58 | - engine.env 59 | environment: 60 | - ENDPOINT=apscheduler 61 | restart: always -------------------------------------------------------------------------------- /image_build/msyh.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/image_build/msyh.ttf -------------------------------------------------------------------------------- /image_build/playwright-xvfb: 
-------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1 4 | 5 | # 替换默认源 6 | COPY ./sources.list /etc/apt/ 7 | 8 | # 安装依赖 9 | RUN apt-get update && apt-get -y install libnss3 xvfb gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 \ 10 | libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 \ 11 | libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 \ 12 | libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 \ 13 | libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget 14 | 15 | # 设置时区 16 | ENV TZ=Asia/Shanghai 17 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 18 | 19 | # 设置默认字体库 20 | COPY ./msyh.ttf /usr/share/fonts/ 21 | RUN fc-cache -fv 22 | 23 | # 开启xvfb虚拟显示器 24 | RUN Xvfb -screen 0 1020x720x16 :99 & 25 | RUN export DISPLAY=:99 26 | ENV DISPLAY=:99 27 | 28 | # 安装playwright驱动 29 | RUN pip install playwright==0.162.2 -i https://pypi.douban.com/simple 30 | RUN python -m playwright install 31 | -------------------------------------------------------------------------------- /image_build/sources.list: -------------------------------------------------------------------------------- 1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free 2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free 3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free 6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free -------------------------------------------------------------------------------- /services/engine/.dockerignore: -------------------------------------------------------------------------------- 1 | env 2 | *.env 3 | .dockerignore 4 | Dockerfile-dev 5 | Dockerfile-prod 6 | htmlcov 7 | celerybeat-shcedule 8 | scheduler.lock 9 | celerybeat.pid 10 | 11 | -------------------------------------------------------------------------------- /services/engine/Dockerfile-dev: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM python:3.7-slim 3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1 4 | 5 | # update apt source 6 | COPY ./sources.list /etc/apt/ 7 | 8 | # install netcat 9 | RUN apt-get update && \ 10 | apt-get -y install netcat && \ 11 | apt-get clean 12 | 13 | # set timezone 14 | ENV TZ=Asia/Shanghai 15 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 16 | 17 | # set working directory 18 | RUN mkdir -p /usr/src/app 19 | WORKDIR /usr/src/app 20 | 21 | # add and install requirements 22 | COPY ./requirements.txt /usr/src/app/requirements.txt 23 | RUN pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ && \ 24 | pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ 25 | 26 | # add app 27 | COPY . 
/usr/src/app 28 | 29 | # run server 30 | CMD ["/usr/src/app/entrypoint.sh"] 31 | -------------------------------------------------------------------------------- /services/engine/aps/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /services/engine/aps/func.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from aps.logger import scheduler_logger 4 | from webs.api.models.db_proxy import task_model_proxy, schedule_task_proxy 5 | from manage import app 6 | 7 | 8 | def send_task_func(task_id): 9 | """下发爬取任务""" 10 | with app.app_context(): 11 | scheduler_logger.info(f'开始调度任务:【task-id:{task_id}】') 12 | task_obj = task_model_proxy.find(id=task_id) 13 | if not task_obj: 14 | scheduler_logger.info(f'该任务已完成或已删除!:【task-id:{task_id}】') 15 | return 16 | 17 | # 判断当前主任务下的调度任务是否大于最大实例数 18 | running_schedule_tasks = schedule_task_proxy.query_running_schedule_tasks(task_id) 19 | max_instances = task_obj.schedule_options.get('schedule_data', {}).get('max_instances', 1) 20 | if len(running_schedule_tasks) >= max_instances: 21 | scheduler_logger.info(f'该任务已超过最大实例数,此次调度已忽略!:【task-id:{task_id}】') 22 | return 23 | 24 | # 异步切割任务下发 25 | from worker import celery_app 26 | celery_app.send_task( 27 | name='delivery_task', queue='engine', 28 | kwargs={'task_id': task_id} 29 | ) 30 | -------------------------------------------------------------------------------- /services/engine/aps/logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | scheduler_logger = logging.getLogger('scheduler') 6 | stream_handler = logging.StreamHandler() 7 | fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s") 8 | stream_handler.setFormatter(fmt) 9 | scheduler_logger.addHandler(stream_handler) # 输出到终端 10 | scheduler_logger.setLevel(logging.INFO) 11 | -------------------------------------------------------------------------------- /services/engine/aps/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | import rpyc 6 | from apscheduler.jobstores.base import JobLookupError 7 | from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore 8 | from apscheduler.schedulers import SchedulerAlreadyRunningError, SchedulerNotRunningError 9 | from apscheduler.schedulers.background import BackgroundScheduler 10 | 11 | from aps.func import send_task_func 12 | from aps.logger import scheduler_logger 13 | 14 | 15 | ################## 16 | # APScheduler配置 17 | 18 | 19 | class APSchedulerConfig(object): 20 | # 时区 21 | timezone = 'Asia/Shanghai' 22 | 23 | # 后端存储器 24 | jobstores = { 25 | 'default': SQLAlchemyJobStore(url=os.getenv('DATABASE_URL'), tablename='apscheduler_jobs') 26 | } 27 | 28 | # 执行器 29 | executors = { 30 | 'default': {'type': 'threadpool', 'max_workers': 10} 31 | } 32 | 33 | # 调度器设置 34 | job_defaults = { 35 | 'coalesce': True, # 是否启用合并运行(在几个运行时间同时到期时只运行一次) 36 | 'misfire_grace_time': 3600, # 任务的执行时间可以延迟多少秒 用于任务时间到达到时,执行器未启动下次重启时任务可以延迟时间 37 | 'max_instances': 1 # 最大实例数 38 | } 39 | 40 | 41 | ################## 42 | # APScheduler调度器 43 | class APSchedulerService(rpyc.Service): 44 | @staticmethod 45 | def start(): 46 | try: 47 | apscheduler.start(paused=False) 48 | scheduler_logger.info('Started 
APScheduler Success!') 49 | except SchedulerAlreadyRunningError: 50 | scheduler_logger.info('APScheduler Already Running!') 51 | 52 | @staticmethod 53 | def shutdown(): 54 | try: 55 | apscheduler.shutdown() 56 | except SchedulerNotRunningError: 57 | scheduler_logger.info('Scheduler has been shut down!') 58 | 59 | @staticmethod 60 | def exposed_add_task(task_id, schedule_type, schedule_data): 61 | """ 62 | 添加调度任务 63 | :param task_id: 64 | :param schedule_type: 65 | :param schedule_data: 66 | :return: 67 | """ 68 | trigger_map = {'instantly': None, 'datetime': 'date'} 69 | apscheduler.add_job( 70 | func=send_task_func, id=str(task_id), kwargs={'task_id': task_id}, 71 | trigger=trigger_map.get(schedule_type, schedule_type), 72 | **schedule_data 73 | ) 74 | 75 | @staticmethod 76 | def exposed_delete_task(task_id, jobstore=None): 77 | """ 78 | 删除调度任务 79 | :param task_id: 80 | :param jobstore: 81 | :return: 82 | """ 83 | try: 84 | apscheduler.remove_job(job_id=str(task_id), jobstore=jobstore) 85 | except JobLookupError: 86 | scheduler_logger.warning('Job was not found or this job has ended!') 87 | 88 | @staticmethod 89 | def exposed_pause_task(task_id, jobstore=None): 90 | """ 91 | 暂停调度任务 92 | :param task_id: 93 | :param jobstore: 94 | :return: 95 | """ 96 | 97 | try: 98 | apscheduler.pause_job(job_id=str(task_id), jobstore=jobstore) 99 | except JobLookupError: 100 | scheduler_logger.warning('Job was not found or this job has ended!') 101 | 102 | @staticmethod 103 | def exposed_resume_task(task_id, jobstore=None): 104 | """ 105 | 恢复调度任务 106 | :param task_id: 107 | :param jobstore: 108 | :return: 109 | """ 110 | 111 | try: 112 | apscheduler.resume_job(job_id=str(task_id), jobstore=jobstore) 113 | except JobLookupError: 114 | scheduler_logger.warning('Job was not found or this job has ended!') 115 | 116 | 117 | ###### 创建APScheduler 118 | apscheduler = BackgroundScheduler( 119 | jobstores=APSchedulerConfig.jobstores, executors=APSchedulerConfig.executors, 120 | job_defaults=APSchedulerConfig.job_defaults, timezone=APSchedulerConfig.timezone) 121 | 122 | ###### 创建APScheduler调度对象,供业务方调用 123 | apscheduler_server = APSchedulerService() 124 | -------------------------------------------------------------------------------- /services/engine/apserver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from rpyc import ThreadedServer 5 | 6 | from aps.server import apscheduler_server, APSchedulerService 7 | 8 | 9 | def runserver(): 10 | """运行APSchedule RPC服务""" 11 | 12 | # 在后台运行APS 13 | apscheduler_server.start() 14 | 15 | # 启动RPC承载APScheduler 16 | server = ThreadedServer( 17 | APSchedulerService, port=15003, 18 | protocol_config={'allow_public_attrs': True, 'allow_pickle': True}) 19 | 20 | # 启动RPC服务 21 | try: 22 | server.start() 23 | except (KeyboardInterrupt, SystemExit): 24 | pass 25 | finally: 26 | apscheduler_server.shutdown() 27 | 28 | 29 | if __name__ == '__main__': 30 | runserver() 31 | -------------------------------------------------------------------------------- /services/engine/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 清除缓存目录 4 | find . -type d -name __pycache__ | xargs rm -rf 5 | 6 | # 编译代码 7 | python3 compile.py build_ext --inplace 8 | if [ $? 
-ne 0 ]; then 9 | exit 1 10 | fi 11 | 12 | # 将.so文件改名 13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 16 | 17 | # 删除.py文件 18 | find ./rpc -name '*.py' | xargs rm -f 19 | find ./webs -name '*.py' | xargs rm -f 20 | find ./worker -name '*.py' | xargs rm -f 21 | 22 | # 清除不需要的文件 23 | rm -rf build 24 | rm -f .gitignore 25 | rm -f compile.py 26 | rm -f build.sh 27 | -------------------------------------------------------------------------------- /services/engine/build_sentry_ini.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | 6 | from configobj import ConfigObj 7 | 8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8') 9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False) 10 | log_ini.write() 11 | -------------------------------------------------------------------------------- /services/engine/compile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Cython.Build import cythonize 4 | from Cython.Distutils import build_ext 5 | from setuptools import setup 6 | from setuptools.extension import Extension 7 | 8 | setup( 9 | ext_modules=cythonize( 10 | [ 11 | Extension('rpc.*', ['rpc/*.py']), 12 | Extension('rpc.client.*', ['rpc/client/*.py']), 13 | Extension('rpc.pb.*', ['rpc/pb/*.py']), 14 | Extension('rpc.server.*', ['rpc/server/*.py']), 15 | Extension('webs.*', ['webs/*.py']), 16 | Extension('webs.api.*', ['webs/api/*.py']), 17 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']), 18 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']), 19 | Extension('webs.api.models*', ['webs/api/models/*.py']), 20 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']), 21 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']), 22 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']), 23 | Extension('webs.api.views.*', ['webs/api/views/*.py']), 24 | Extension('webs.core.*', ['webs/core/*.py']), 25 | Extension('webs.core.requests.*', ['webs/core/requests/*.py']), 26 | Extension('worker.*', ['worker/*.py']), 27 | Extension('worker.library.*', ['worker/library/*.py']) 28 | ], 29 | build_dir='build', 30 | compiler_directives=dict( 31 | always_allow_keywords=True, language_level=3 32 | ) 33 | ), 34 | cmdclass=dict( 35 | build_ext=build_ext 36 | ) 37 | ) 38 | -------------------------------------------------------------------------------- /services/engine/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # web服务 4 | if [ "$ENDPOINT" = "web" ]; then 5 | # 开发环境 6 | if [ "$FLASK_ENV" = "development" ]; then 7 | flask run -h 0.0.0.0 -p 5000 8 | 9 | # 生产环境 10 | elif [ "$FLASK_ENV" = "production" ]; then 11 | 12 | # 注册sentry 13 | python build_sentry_ini.py 14 | 15 | # 使用gunicorn承载flask服务 16 | gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app 17 | fi 18 | 19 | # grpc服务端 20 | elif [ "$ENDPOINT" = "engine-grpc" ]; then 21 | python grpcserver.py 22 | 23 | # Apscheduler 24 | elif [ "$ENDPOINT" = "apscheduler" ]; then 25 | python apserver.py 26 | 27 | # worker 28 | elif [ "$ENDPOINT" = 
"engine-worker" ]; then 29 | # celery -A worker.celery_app worker -Q engine,base_result -l info -c 5 -n worker_engine@%h 30 | celery -A worker.celery_app worker -Q engine,base_result -l info --pool=prefork --concurrency=10 --prefetch-multiplier 4 --without-heartbeat -n worker_engine@%h 31 | fi 32 | -------------------------------------------------------------------------------- /services/engine/grpcserver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import time 5 | from concurrent import futures 6 | 7 | import grpc 8 | 9 | from rpc.pb import result_pb2_grpc 10 | from rpc.server.result import ResultServicer 11 | 12 | 13 | # 运行函数 14 | def run(): 15 | # 以线程池运行rpc服务 16 | server = grpc.server( 17 | futures.ThreadPoolExecutor(max_workers=os.getenv('GRPC_SERVER_MAX_WORKER_COUNT', 10)), 18 | options=[ 19 | ( 20 | 'grpc.max_send_message_length', 21 | os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200) * 1024 * 1024 22 | ), 23 | ( 24 | 'grpc.max_receive_message_length', 25 | os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200) * 1024 * 1024 26 | ), 27 | ] 28 | ) 29 | 30 | ###### 注册服务 31 | # 保存爬虫基本信息 32 | result_pb2_grpc.add_ResultServicer_to_server(ResultServicer(), server) 33 | 34 | # 设置服务器监听地址 35 | server.add_insecure_port(address='0.0.0.0:15002') 36 | 37 | # 启动服务 38 | server.start() 39 | 40 | # 阻塞rpc服务 41 | try: 42 | while True: 43 | time.sleep(60 * 60 * 24) 44 | except KeyboardInterrupt: 45 | server.stop(0) 46 | 47 | 48 | if __name__ == '__main__': 49 | run() 50 | -------------------------------------------------------------------------------- /services/engine/gunicorn_config.py: -------------------------------------------------------------------------------- 1 | # Sample Gunicorn configuration file. 2 | 3 | import multiprocessing as mlp 4 | 5 | # 解决无限递归 6 | import os 7 | 8 | import gevent.monkey 9 | 10 | gevent.monkey.patch_all() 11 | 12 | # 13 | # Server socket 14 | # 15 | # bind - The socket to bind. 16 | # 17 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'. 18 | # An IP is a valid HOST. 19 | # 20 | # backlog - The number of pending connections. This refers 21 | # to the number of clients that can be waiting to be 22 | # served. Exceeding this number results in the client 23 | # getting an error when attempting to connect. It should 24 | # only affect servers under significant load. 25 | # 26 | # Must be a positive integer. Generally set in the 64-2048 27 | # range. 28 | # 29 | 30 | bind = '0.0.0.0:5000' 31 | backlog = 2048 32 | 33 | # 34 | # Worker processes 35 | # 36 | # workers - The number of worker processes that this server 37 | # should keep alive for handling requests. 38 | # 39 | # A positive integer generally in the 2-4 x $(NUM_CORES) 40 | # range. You'll want to vary this a bit to find the best 41 | # for your particular application's work load. 42 | # 43 | # worker_class - The type of workers to use. The default 44 | # sync class should handle most 'normal' types of work 45 | # loads. You'll want to read 46 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type 47 | # for information on when you might want to choose one 48 | # of the other worker classes. 49 | # 50 | # A string referring to a Python path to a subclass of 51 | # gunicorn.workers.base.Worker. 
The default provided values 52 | # can be seen at 53 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class 54 | # 55 | # worker_connections - For the eventlet and gevent worker classes 56 | # this limits the maximum number of simultaneous clients that 57 | # a single process can handle. 58 | # 59 | # A positive integer generally set to around 1000. 60 | # 61 | # timeout - If a worker does not notify the master process in this 62 | # number of seconds it is killed and a new worker is spawned 63 | # to replace it. 64 | # 65 | # Generally set to thirty seconds. Only set this noticeably 66 | # higher if you're sure of the repercussions for sync workers. 67 | # For the non sync workers it just means that the worker 68 | # process is still communicating and is not tied to the length 69 | # of time required to handle a single request. 70 | # 71 | # keepalive - The number of seconds to wait for the next request 72 | # on a Keep-Alive HTTP connection. 73 | # 74 | # A positive integer. Generally set in the 1-5 seconds range. 75 | # 76 | 77 | # Number of processes 78 | workers = mlp.cpu_count() * 2 + 1 79 | 80 | # Threads 81 | threads = mlp.cpu_count() * 2 82 | 83 | worker_class = 'gevent' 84 | worker_connections = 1000 85 | timeout = os.getenv('GUNICORN_TIMEOUT', 180) 86 | keepalive = 2 87 | 88 | # 89 | # spew - Install a trace function that spews every line of Python 90 | # that is executed when running the server. This is the 91 | # nuclear option. 92 | # 93 | # True or False 94 | # 95 | 96 | spew = False 97 | 98 | # 99 | # Server mechanics 100 | # 101 | # daemon - Detach the main Gunicorn process from the controlling 102 | # terminal with a standard fork/fork sequence. 103 | # 104 | # True or False 105 | # 106 | # raw_env - Pass environment variables to the execution environment. 107 | # 108 | # pidfile - The path to a pid file to write 109 | # 110 | # A path string or None to not write a pid file. 111 | # 112 | # user - Switch worker processes to run as this user. 113 | # 114 | # A valid user id (as an integer) or the name of a user that 115 | # can be retrieved with a call to pwd.getpwnam(value) or None 116 | # to not change the worker process user. 117 | # 118 | # group - Switch worker process to run as this group. 119 | # 120 | # A valid group id (as an integer) or the name of a user that 121 | # can be retrieved with a call to pwd.getgrnam(value) or None 122 | # to change the worker processes group. 123 | # 124 | # umask - A mask for file permissions written by Gunicorn. Note that 125 | # this affects unix socket permissions. 126 | # 127 | # A valid value for the os.umask(mode) call or a string 128 | # compatible with int(value, 0) (0 means Python guesses 129 | # the base, so values like "0", "0xFF", "0022" are valid 130 | # for decimal, hex, and octal representations) 131 | # 132 | # tmp_upload_dir - A directory to store temporary request data when 133 | # requests are read. This will most likely be disappearing soon. 134 | # 135 | # A path to a directory where the process owner can write. Or 136 | # None to signal that Python should choose one on its own. 137 | # 138 | 139 | # 140 | # Logging 141 | # 142 | # logfile - The path to a log file to write to. 143 | # 144 | # A path string. "-" means log to stdout. 
145 | # 146 | # loglevel - The granularity of log output 147 | # 148 | # A string of "debug", "info", "warning", "error", "critical" 149 | # 150 | 151 | errorlog = '-' 152 | loglevel = 'error' 153 | accesslog = '-' 154 | access_log_format = '{"request_address": "%(h)s", ' \ 155 | '"request_time": "%(t)s", ' \ 156 | '"request": "%(r)s", ' \ 157 | '"http_status_code": "%(s)s", ' \ 158 | '"http_request_url": "%(U)s", ' \ 159 | '"http_query_string": "%(q)s", ' \ 160 | '"request_headers": {' \ 161 | '"content-type": "%({content-type}i)s", ' \ 162 | '"content-length": "%({content-length}i)s", ' \ 163 | '"user-agent": "%(a)s"' \ 164 | '}}' 165 | -------------------------------------------------------------------------------- /services/engine/gunicorn_logging.ini: -------------------------------------------------------------------------------- 1 | # Logging configuration 2 | 3 | [loggers] 4 | keys = root, gunicorn.access, gunicorn.error 5 | 6 | [handlers] 7 | keys = access, error, sentry 8 | 9 | [formatters] 10 | keys = json, generic 11 | 12 | # Root logger 13 | # The root logger sends messages to the console and to Sentry. 14 | [logger_root] 15 | handlers = error, sentry 16 | 17 | # Gunicorn loggers 18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'. 19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation. 20 | # Only the critical logger has a handler to send messages to Sentry. 21 | 22 | [logger_gunicorn.access] 23 | level = INFO 24 | handlers = access 25 | propagate = 0 26 | qualname = gunicorn.access 27 | 28 | [logger_gunicorn.error] 29 | level = ERROR 30 | handlers = error, sentry 31 | propagate = 0 32 | qualname = gunicorn.error 33 | 34 | # Handlers 35 | [handler_access] 36 | class = StreamHandler 37 | formatter = json 38 | args = (sys.stdout, ) 39 | 40 | [handler_error] 41 | class = StreamHandler 42 | formatter = json 43 | args = (sys.stderr,) 44 | 45 | [handler_sentry] 46 | class = raven.handlers.logging.SentryHandler 47 | level = ERROR 48 | formatter = generic 49 | sentry_dsn = example 50 | args = [%(sentry_dsn)s] 51 | 52 | [formatter_generic] 53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s 54 | [formatter_json] 55 | class = webs.api.utils.loggers.JSONFormatter -------------------------------------------------------------------------------- /services/engine/manage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import click 4 | from flask.cli import FlaskGroup 5 | from webs import create_app 6 | 7 | app = create_app() 8 | cli = FlaskGroup(create_app=create_app) 9 | 10 | 11 | @cli.command('add_spider_server') 12 | @click.argument('address') 13 | def _add_spider_server(address): 14 | from webs.api.utils.helper import add_spider_server 15 | add_spider_server(address) 16 | 17 | 18 | if __name__ == '__main__': 19 | cli() 20 | -------------------------------------------------------------------------------- /services/engine/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /services/engine/migrations/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
2 | 3 | [alembic] 4 | # template used to generate migration files 5 | # file_template = %%(rev)s_%%(slug)s 6 | 7 | # set to 'true' to run the environment during 8 | # the 'revision' command, regardless of autogenerate 9 | # revision_environment = false 10 | 11 | 12 | # Logging configuration 13 | [loggers] 14 | keys = root,sqlalchemy,alembic 15 | 16 | [handlers] 17 | keys = console 18 | 19 | [formatters] 20 | keys = generic 21 | 22 | [logger_root] 23 | level = WARN 24 | handlers = console 25 | qualname = 26 | 27 | [logger_sqlalchemy] 28 | level = WARN 29 | handlers = 30 | qualname = sqlalchemy.engine 31 | 32 | [logger_alembic] 33 | level = INFO 34 | handlers = 35 | qualname = alembic 36 | 37 | [handler_console] 38 | class = StreamHandler 39 | args = (sys.stderr,) 40 | level = NOTSET 41 | formatter = generic 42 | 43 | [formatter_generic] 44 | format = %(levelname)-5.5s [%(name)s] %(message)s 45 | datefmt = %H:%M:%S 46 | -------------------------------------------------------------------------------- /services/engine/migrations/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | import logging 4 | from logging.config import fileConfig 5 | 6 | from sqlalchemy import engine_from_config 7 | from sqlalchemy import pool 8 | 9 | from alembic import context 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 17 | fileConfig(config.config_file_name) 18 | logger = logging.getLogger('alembic.env') 19 | 20 | # add your model's MetaData object here 21 | # for 'autogenerate' support 22 | # from myapp import mymodel 23 | # target_metadata = mymodel.Base.metadata 24 | from flask import current_app 25 | config.set_main_option('sqlalchemy.url', 26 | current_app.config.get('SQLALCHEMY_DATABASE_URI')) 27 | target_metadata = current_app.extensions['migrate'].db.metadata 28 | 29 | # other values from the config, defined by the needs of env.py, 30 | # can be acquired: 31 | # my_important_option = config.get_main_option("my_important_option") 32 | # ... etc. 33 | 34 | 35 | def run_migrations_offline(): 36 | """Run migrations in 'offline' mode. 37 | 38 | This configures the context with just a URL 39 | and not an Engine, though an Engine is acceptable 40 | here as well. By skipping the Engine creation 41 | we don't even need a DBAPI to be available. 42 | 43 | Calls to context.execute() here emit the given string to the 44 | script output. 45 | 46 | """ 47 | url = config.get_main_option("sqlalchemy.url") 48 | context.configure( 49 | url=url, target_metadata=target_metadata, literal_binds=True 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online(): 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 
61 | 62 | """ 63 | 64 | # this callback is used to prevent an auto-migration from being generated 65 | # when there are no changes to the schema 66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html 67 | def process_revision_directives(context, revision, directives): 68 | if getattr(config.cmd_opts, 'autogenerate', False): 69 | script = directives[0] 70 | if script.upgrade_ops.is_empty(): 71 | directives[:] = [] 72 | logger.info('No changes in schema detected.') 73 | 74 | connectable = engine_from_config( 75 | config.get_section(config.config_ini_section), 76 | prefix='sqlalchemy.', 77 | poolclass=pool.NullPool, 78 | ) 79 | 80 | with connectable.connect() as connection: 81 | context.configure( 82 | connection=connection, 83 | target_metadata=target_metadata, 84 | process_revision_directives=process_revision_directives, 85 | **current_app.extensions['migrate'].configure_args 86 | ) 87 | 88 | with context.begin_transaction(): 89 | context.run_migrations() 90 | 91 | 92 | if context.is_offline_mode(): 93 | run_migrations_offline() 94 | else: 95 | run_migrations_online() 96 | -------------------------------------------------------------------------------- /services/engine/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /services/engine/migrations/versions/1569921cac58_加入响应时间和渲染时间.py: -------------------------------------------------------------------------------- 1 | """加入响应时间和渲染时间 2 | 3 | Revision ID: 1569921cac58 4 | Revises: b3bd5bc9e4e3 5 | Create Date: 2021-04-07 17:11:13.336649 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '1569921cac58' 13 | down_revision = 'b3bd5bc9e4e3' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | # ### commands auto generated by Alembic - please adjust! ### 20 | op.add_column('results', sa.Column('load_complete_time', sa.Integer(), nullable=True)) 21 | op.add_column('results', sa.Column('response_time', sa.Integer(), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('results', 'response_time') 28 | op.drop_column('results', 'load_complete_time') 29 | # ### end Alembic commands ### 30 | -------------------------------------------------------------------------------- /services/engine/migrations/versions/5b189e0161ee_加入网站编码.py: -------------------------------------------------------------------------------- 1 | """加入网站编码 2 | 3 | Revision ID: 5b189e0161ee 4 | Revises: 1569921cac58 5 | Create Date: 2021-04-19 10:32:15.201074 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '5b189e0161ee' 14 | down_revision = '1569921cac58' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('results', sa.Column('charset', sa.String(length=256), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('results', 'charset') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /services/engine/migrations/versions/71bf761944f8_新增网站图标字段.py: -------------------------------------------------------------------------------- 1 | """新增网站图标字段 2 | 3 | Revision ID: 71bf761944f8 4 | Revises: 4a243739ef84 5 | Create Date: 2021-01-21 16:39:56.687514 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '71bf761944f8' 14 | down_revision = '4a243739ef84' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('results', sa.Column('favicon_link', sa.String(length=1024), nullable=True)) 22 | op.add_column('results', sa.Column('favicon_md5', sa.String(length=50), nullable=True)) 23 | # ### end Alembic commands ### 24 | 25 | 26 | def downgrade(): 27 | # ### commands auto generated by Alembic - please adjust! ### 28 | op.drop_column('results', 'favicon_md5') 29 | op.drop_column('results', 'favicon_link') 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /services/engine/migrations/versions/b3bd5bc9e4e3_增加extra客户端额外数据.py: -------------------------------------------------------------------------------- 1 | """增加extra客户端额外数据 2 | 3 | Revision ID: b3bd5bc9e4e3 4 | Revises: 71bf761944f8 5 | Create Date: 2021-01-25 17:52:28.285830 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'b3bd5bc9e4e3' 14 | down_revision = '71bf761944f8' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('tasks', sa.Column('extra_data', sa.Text(), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.drop_column('tasks', 'extra_data') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /services/engine/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.2 2 | alembic==1.4.3 3 | amqp==2.6.1 4 | APScheduler==3.6.3 5 | async-timeout==3.0.1 6 | attrs==20.3.0 7 | beautifulsoup4==4.9.3 8 | billiard==3.6.3.0 9 | celery==4.3.0 10 | certifi==2020.11.8 11 | chardet==3.0.4 12 | click==7.1.2 13 | configobj==5.0.6 14 | Cython==0.29.21 15 | Flask==1.1.2 16 | Flask-Migrate==2.4.0 17 | Flask-Redis==0.3.0 18 | Flask-SQLAlchemy==2.3.2 19 | gevent==1.4.0 20 | greenlet==0.4.15 21 | grpcio==1.33.2 22 | grpcio-tools==1.33.2 23 | gunicorn==19.9.0 24 | html2text==2020.1.16 25 | idna==2.8 26 | importlib-metadata==2.0.0 27 | itsdangerous==1.1.0 28 | Jinja2==2.11.2 29 | kombu==4.6.11 30 | Mako==1.1.3 31 | MarkupSafe==1.1.1 32 | marshmallow==2.19.2 33 | multidict==5.0.2 34 | Pillow==8.2.0 35 | plumbum==1.6.9 36 | protobuf==3.12.2 --no-binary protobuf 37 | psycopg2-binary==2.7.6.1 38 | python-dateutil==2.8.1 39 | python-editor==1.0.4 40 | pytz==2020.4 41 | raven==6.10.0 42 | redis==3.5.3 43 | requests==2.22.0 44 | rpyc==4.1.5 45 | six==1.15.0 46 | soupsieve==2.1 47 | SQLAlchemy==1.3.20 48 | typing-extensions==3.7.4.3 49 | tzlocal==2.1 50 | urllib3==1.25.11 51 | vine==1.3.0 52 | webargs==4.0.0 53 | Werkzeug==1.0.1 54 | yarl==1.6.3 55 | zipp==3.4.0 56 | -------------------------------------------------------------------------------- /services/engine/rpc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/rpc/client/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/rpc/client/callback_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | 5 | import grpc 6 | 7 | from rpc.pb import callback_pb2 8 | from rpc.pb.callback_pb2_grpc import ResultStub 9 | 10 | CHUNK_SIZE = 10 * 1024 11 | 12 | 13 | class CallbackClient(object): 14 | 15 | def __init__(self, rpc_server): 16 | # RPC服务器信道 17 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[ 18 | ('grpc.max_send_message_length', int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024), 19 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024), 20 | ]) 21 | 22 | # 获取Result grpc服务对象 23 | self.stub = ResultStub(channel) 24 | 25 | def callback_save_result(self, task_obj, result): 26 | """ 27 | 回调爬虫信息 28 | :return: 29 | """ 30 | result['extra_data'] = task_obj.extra_data 31 | self.stub.SaveResult( 32 | callback_pb2.SaveResultRequest( 33 | customer_id=task_obj.customer_id, 34 | task_id=task_obj.id, 35 | task_status='executing', 36 | finished=False, 37 | crawl_results=json.dumps(result, ensure_ascii=False) 38 | ), 39 | timeout=30 40 | ) 41 | 42 | def callback_task_finished(self, customer_id, task_id): 43 | """回调任务完成""" 44 | 45 | self.stub.SaveResult( 46 | callback_pb2.SaveResultRequest( 47 | customer_id=customer_id, task_id=task_id, 48 | finished=True, task_status='finished'), 49 | timeout=30 50 | ) 51 | 
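
For context, a minimal usage sketch of CallbackClient as the engine worker might call it once a page result is ready. This snippet is not part of the repository: the surrounding function name and the shape of the `result` dict are assumptions; only the two client methods and the attributes read from `task_obj` (crawl_options, customer_id, id, extra_data) come from the code above.

# Hypothetical usage sketch (not in the repo): pushing one crawl result back
# to the customer's gRPC endpoint with the CallbackClient defined above.
import grpc

from rpc.client.callback_client import CallbackClient


def push_result_to_customer(task_obj, result: dict):
    options = task_obj.crawl_options or {}
    # Tasks that did not register a callback are simply skipped,
    # mirroring the truthiness check used in rpc/server/result.py.
    if not options.get('callback_type'):
        return

    client = CallbackClient(rpc_server=options.get('callback_address'))
    try:
        # Sends task_status='executing' / finished=False together with the
        # JSON-encoded result (see callback_save_result above).
        client.callback_save_result(task_obj, result)
    except grpc.RpcError as exc:
        print(f'callback failed for task {task_obj.id}: {exc}')
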
-------------------------------------------------------------------------------- /services/engine/rpc/codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 声明proto路径和pb文件生成路径 4 | declare -a proto_path=("protos") 5 | declare -a python_out=("pb") 6 | 7 | # 构造pb文件 8 | python -m grpc_tools.protoc \ 9 | --proto_path=$proto_path/ \ 10 | --python_out=$python_out \ 11 | --grpc_python_out=$python_out \ 12 | $proto_path/*.proto 13 | 14 | # 替换pb文件的错误引入语句 15 | sed -i '' -E 's/^import (.*pb2)/from . import \1/g' ${python_out}/*pb2*.py -------------------------------------------------------------------------------- /services/engine/rpc/pb/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/rpc/pb/callback_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | 5 | from . import callback_pb2 as result__pb2 6 | 7 | 8 | class ResultStub(object): 9 | """定义服务 10 | """ 11 | 12 | def __init__(self, channel): 13 | """Constructor. 14 | 15 | Args: 16 | channel: A grpc.Channel. 17 | """ 18 | self.SaveResult = channel.unary_unary( 19 | '/result.Result/SaveResult', 20 | request_serializer=result__pb2.SaveResultRequest.SerializeToString, 21 | response_deserializer=result__pb2.SaveResultResponse.FromString, 22 | ) 23 | 24 | 25 | class ResultServicer(object): 26 | """定义服务 27 | """ 28 | 29 | def SaveResult(self, request, context): 30 | """保存基本爬取信息 31 | """ 32 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 33 | context.set_details('Method not implemented!') 34 | raise NotImplementedError('Method not implemented!') 35 | 36 | 37 | def add_ResultServicer_to_server(servicer, server): 38 | rpc_method_handlers = { 39 | 'SaveResult': grpc.unary_unary_rpc_method_handler( 40 | servicer.SaveResult, 41 | request_deserializer=result__pb2.SaveResultRequest.FromString, 42 | response_serializer=result__pb2.SaveResultResponse.SerializeToString, 43 | ), 44 | } 45 | generic_handler = grpc.method_handlers_generic_handler( 46 | 'result.Result', rpc_method_handlers) 47 | server.add_generic_rpc_handlers((generic_handler,)) 48 | 49 | 50 | # This class is part of an EXPERIMENTAL API. 
51 | class Result(object): 52 | """定义服务 53 | """ 54 | 55 | @staticmethod 56 | def SaveResult(request, 57 | target, 58 | options=(), 59 | channel_credentials=None, 60 | call_credentials=None, 61 | insecure=False, 62 | compression=None, 63 | wait_for_ready=None, 64 | timeout=None, 65 | metadata=None): 66 | return grpc.experimental.unary_unary(request, target, '/result.Result/SaveResult', 67 | result__pb2.SaveResultRequest.SerializeToString, 68 | result__pb2.SaveResultResponse.FromString, 69 | options, channel_credentials, 70 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 71 | -------------------------------------------------------------------------------- /services/engine/rpc/protos/result.proto: -------------------------------------------------------------------------------- 1 | // 使用proto3协议 2 | syntax = "proto3"; 3 | 4 | // 当前包 5 | package result; 6 | 7 | 8 | // 定义服务 9 | service Result { 10 | // 保存基本爬取信息 11 | rpc SaveBaseResult(SaveBaseResultRequest) returns (SaveBaseResultResponse); 12 | // 基于流式传输图片 13 | rpc StreamUploadPicture(stream StreamUploadPictureRequest) returns (StreamUploadPictureResponse); 14 | // 基于流式传输har文件 15 | rpc StreamUploadHarFile(stream StreamUploadHarFileRequest) returns (StreamUploadHarFileResponse); 16 | // 标记子任务爬取状态 17 | rpc SetSubTaskStatus(SetSubTaskStatusRequest) returns (SetSubTaskStatusResponse); 18 | } 19 | 20 | 21 | // 请求参数 22 | message SaveBaseResultRequest { 23 | int32 subtask_id = 1; 24 | int32 url_id = 2; 25 | string url_address = 3; 26 | int32 http_code = 4; 27 | string title = 5; 28 | string content = 6; 29 | string current_url = 7; 30 | string screenshot_id = 8; 31 | string response_headers = 9; 32 | string finished_at = 10; 33 | string har_uuid = 11; 34 | repeated RedirectChain redirect_chain = 12; 35 | repeated Cookies cookies = 13; 36 | int32 response_time = 14; 37 | int32 load_complete_time = 15; 38 | } 39 | 40 | 41 | // 网站重定向链接链表 42 | message RedirectChain { 43 | string redirect_url = 1; 44 | int32 redirect_http_code = 2; 45 | } 46 | 47 | 48 | // Cookies 49 | message Cookies { 50 | string name = 1; 51 | string path = 2; 52 | string value = 3; 53 | string domain = 4; 54 | bool secure = 5; 55 | int64 expires = 6; 56 | bool httpOnly = 7; 57 | string sameSite = 8; 58 | } 59 | 60 | 61 | // 响应 62 | message SaveBaseResultResponse{ 63 | bool status = 1; 64 | } 65 | 66 | // 图片流 67 | message StreamUploadPictureRequest { 68 | message FileData { 69 | string filename = 1; 70 | bytes buffer = 2; 71 | } 72 | oneof payload { 73 | string filename = 1; 74 | FileData file_data = 2; 75 | } 76 | } 77 | 78 | // 图片长度 79 | message StreamUploadPictureResponse { 80 | int32 length = 1; 81 | } 82 | 83 | // 子任务id、状态 84 | message SetSubTaskStatusRequest { 85 | int32 subtask_id = 1; 86 | bool status = 2; 87 | string finished_at = 3; 88 | } 89 | 90 | // 设置子任务状态响应 91 | message SetSubTaskStatusResponse { 92 | bool set_success = 1; 93 | } 94 | 95 | // har文件流 96 | message StreamUploadHarFileRequest { 97 | message FileData { 98 | string filename = 1; 99 | bytes buffer = 2; 100 | } 101 | oneof payload { 102 | string filename = 1; 103 | FileData file_data = 2; 104 | } 105 | } 106 | 107 | // har文件流大小 108 | message StreamUploadHarFileResponse { 109 | int32 length = 1; 110 | } -------------------------------------------------------------------------------- /services/engine/rpc/server/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 
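
The spider pushes screenshots and HAR archives to the engine through the client-streaming RPCs declared in result.proto above: the first streamed message carries only the filename, every following message carries a FileData chunk, and the engine's save_chunks_to_file (see rpc/server/result.py below) reassembles them on disk. The spider-side client (services/spider/rpc/client/result.py) is not reproduced in this excerpt, so the following is only a minimal compatible-uploader sketch; the stub imports assume the modules generated from result.proto, the engine-grpc:15002 address comes from engine.docker-compose.yml, and names such as upload_screenshot and _picture_chunks are illustrative, not from the source.

# Hypothetical spider-side uploader for the StreamUploadPicture RPC above.
import os

import grpc

from rpc.pb import result_pb2
from rpc.pb.result_pb2_grpc import ResultStub

CHUNK_SIZE = 10 * 1024  # 10 KB per streamed message, matching the client code above


def _picture_chunks(filepath):
    # First message: only the filename, which the server uses to name the file.
    yield result_pb2.StreamUploadPictureRequest(filename=os.path.basename(filepath))
    with open(filepath, 'rb') as f:
        while True:
            buffer = f.read(CHUNK_SIZE)
            if not buffer:
                break
            # Subsequent messages: raw bytes wrapped in the FileData payload.
            yield result_pb2.StreamUploadPictureRequest(
                file_data=result_pb2.StreamUploadPictureRequest.FileData(buffer=buffer)
            )


def upload_screenshot(filepath, engine_address='engine-grpc:15002'):
    channel = grpc.insecure_channel(engine_address)
    stub = ResultStub(channel)
    response = stub.StreamUploadPicture(_picture_chunks(filepath), timeout=60)
    # The server replies with the number of bytes it wrote to disk.
    return response.length
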
-------------------------------------------------------------------------------- /services/engine/rpc/server/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | from google.protobuf.json_format import MessageToDict 6 | 7 | from manage import app 8 | from rpc.pb import result_pb2, result_pb2_grpc 9 | from webs.api.models.db_proxy import subtask_model_proxy, task_model_proxy, schedule_task_proxy, apscheduler_model_proxy 10 | from worker import celery_app 11 | 12 | 13 | def save_chunks_to_file(request_streams, folder_path): 14 | filename, file_chunks = None, [] 15 | for yield_obj in request_streams: 16 | if getattr(yield_obj, 'filename'): 17 | filename = yield_obj.filename 18 | else: 19 | file_chunks.append(yield_obj.file_data) 20 | with open(f'/usr/src/app/{folder_path}/{filename}', 'wb') as f: 21 | for chunk in file_chunks: 22 | f.write(chunk.buffer) 23 | return os.path.getsize(f'/usr/src/app/{folder_path}/{filename}') 24 | 25 | 26 | class ResultServicer(result_pb2_grpc.ResultServicer): 27 | # 创建截图目录 28 | if not os.path.exists('/usr/src/app/screenshots'): 29 | os.mkdir('/usr/src/app/screenshots') 30 | 31 | # 实现SaveBaseResult方法的rpc调用 32 | def SaveBaseResult(self, request, context): 33 | # proto消息体参数转为字典 34 | request_dict = MessageToDict(request, preserving_proto_field_name=True) 35 | 36 | # 异步处理相关爬取数据 37 | celery_app.send_task('save_base_result', queue='base_result', kwargs=request_dict) 38 | 39 | return result_pb2.SaveBaseResultResponse(status=True) 40 | 41 | # 实现StreamUploadPicture流式处理传输图片的rpc调用 42 | def StreamUploadPicture(self, request_iterator, context): 43 | try: 44 | file_length = save_chunks_to_file(request_iterator, folder_path='screenshots') 45 | except FileNotFoundError: 46 | file_length = 0 47 | return result_pb2.StreamUploadPictureResponse(length=file_length) 48 | 49 | # 实现SetSubTaskStatus标记子任务处理状态 50 | def SetSubTaskStatus(self, request, context): 51 | # 在flask上下文中设置子任务状态 52 | with app.app_context(): 53 | ###### 设置子任务状态 54 | subtask_obj = subtask_model_proxy.set_many_attr( 55 | obj_id=request.subtask_id, 56 | fields_v={'finished': request.status, 'finished_at': request.finished_at} 57 | ) 58 | ###### 设置调度任务状态 59 | unfinished_count = subtask_model_proxy.query_unfinished_subtask_count(subtask_obj.schedule_task_id) 60 | if unfinished_count == 0: 61 | schedule_task_obj = schedule_task_proxy.query_schedule_task_obj_by_subtask_id(subtask_obj.id) 62 | schedule_task_proxy.set_many_attr( 63 | obj=schedule_task_obj, fields_v={'schedule_task_status': 'finished', 'finished': True} 64 | ) 65 | 66 | # 查询主任务 67 | task_id, running_schedule_tasks = schedule_task_proxy.query_running_task_and_task_id( 68 | subtask_obj.schedule_task_id) 69 | task_obj = task_model_proxy.find(id=task_id) 70 | 71 | # 回调当前调度任务完成 72 | if schedule_task_obj.crawl_options.get('callback_type'): 73 | from rpc.client.callback_client import CallbackClient 74 | try: 75 | callback_client = CallbackClient(rpc_server=task_obj.crawl_options.get('callback_address')) 76 | callback_client.callback_task_finished(customer_id=task_obj.customer_id, task_id=task_id) 77 | except Exception as e: 78 | print(e) 79 | print(f"回调任务完成失败:ID-{task_id}") 80 | 81 | # 设置主任务为完结状态 82 | next_run_time = apscheduler_model_proxy.get_next_run_time(apschedule_id=task_id) 83 | if not running_schedule_tasks and not next_run_time: 84 | task_model_proxy.set_many_attr( 85 | obj=task_obj, fields_v={'task_status': 'finished', 'finished': True} 86 | ) 87 | 88 | return 
result_pb2.SetSubTaskStatusResponse(set_success=True) 89 | 90 | # 实现StreamUploadHarFile流式处理传输文件的rpc调用 91 | def StreamUploadHarFile(self, request_iterator, context): 92 | try: 93 | file_length = save_chunks_to_file(request_iterator, folder_path='hars') 94 | except FileNotFoundError: 95 | file_length = 0 96 | return result_pb2.StreamUploadPictureResponse(length=file_length) 97 | -------------------------------------------------------------------------------- /services/engine/sources.list: -------------------------------------------------------------------------------- 1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free 2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free 3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free 6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free -------------------------------------------------------------------------------- /services/engine/wappalyzer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from wappalyzer.wappalyzerhandler import WappalyzerHandler 4 | 5 | wappalyzer_handler = WappalyzerHandler(techno_path='wappalyzer/data.json') 6 | -------------------------------------------------------------------------------- /services/engine/wappalyzer/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from typing import Any 6 | import re 7 | 8 | from wappalyzer.modelcalss import Pattern 9 | 10 | 11 | def _transform_patterns( 12 | patterns: Any, 13 | case_sensitive: bool = False 14 | ): 15 | """Canonicalize the patterns of different sections. 16 | """ 17 | 18 | def to_list(value): 19 | return value if type(value) is list else [value] 20 | 21 | if not patterns: 22 | return [] 23 | 24 | if type(patterns) is str or type(patterns) is list: 25 | patterns = { 26 | "main": patterns 27 | } 28 | 29 | parsed = {} 30 | for key in patterns: 31 | name = key if case_sensitive else key.lower() 32 | parsed[name] = [ 33 | _parse_pattern(ptrn, key) 34 | for ptrn in to_list(patterns[key]) 35 | ] 36 | 37 | return parsed["main"] if "main" in parsed else parsed 38 | 39 | 40 | def _parse_pattern(pattern: str, key: str = ""): 41 | """Parse the regex pattern and creates a Pattern object. 42 | It extracts the regex, the version and the confidence values of 43 | the given string. 
44 | """ 45 | parts = pattern.split("\\;") 46 | 47 | value = parts[0] 48 | 49 | # seems that in js "[^]" is similar to ".", however python 50 | # re interprets in a diferent way (which leads to an error), 51 | # so it is better to substitute it 52 | regex = value.replace("/", "\\/").replace("[^]", ".") 53 | 54 | attrs = { 55 | "value": value, 56 | "regex": re.compile(regex, re.I) 57 | } 58 | for attr in parts[1:]: 59 | attr = attr.split(":") 60 | if len(attr) > 1: 61 | attrs[attr[0]] = ":".join(attr[1:]) 62 | 63 | return Pattern( 64 | value=attrs["value"], 65 | regex=attrs["regex"], 66 | confidence=int(attrs.get("confidence", 100)), 67 | version=attrs.get("version", ""), 68 | key=key, 69 | ) 70 | 71 | 72 | def extract_scripts(html: str): 73 | soup = BeautifulSoup(html, "html.parser") 74 | script_tags = soup.findAll("script") 75 | 76 | scripts = [] 77 | for script_tag in script_tags: 78 | try: 79 | src = script_tag.attrs["src"] 80 | if not src.startswith("data:text/javascript;"): 81 | scripts.append(src) 82 | except KeyError: 83 | pass 84 | 85 | return scripts 86 | 87 | 88 | def extract_metas(html: str): 89 | soup = BeautifulSoup(html, "html.parser") 90 | meta_tags = soup.findAll("meta") 91 | 92 | metas = {} 93 | for meta_tag in meta_tags: 94 | try: 95 | key = meta_tag.attrs.get("name", None) \ 96 | or meta_tag.attrs["property"] 97 | metas[key.lower()] = [meta_tag.attrs["content"]] 98 | except KeyError: 99 | continue 100 | 101 | return metas 102 | 103 | 104 | def extract_cookies(cookies_list): 105 | cookies_dict = {} 106 | for each_cookie in cookies_list: 107 | cookies_dict.update({each_cookie['name']: each_cookie['value']}) 108 | return cookies_dict 109 | 110 | 111 | def extract_headers(headers): 112 | return { 113 | k.lower(): [v] 114 | for k, v in headers.items() 115 | } 116 | -------------------------------------------------------------------------------- /services/engine/wappalyzer/modelcalss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Technology: 4 | def __init__( 5 | self, 6 | name: str, 7 | categories, 8 | url, 9 | headers, 10 | cookies, 11 | html, 12 | meta, 13 | scripts, 14 | js, 15 | implies, 16 | excludes, 17 | icon: str, 18 | website: str, 19 | cpe: str, 20 | ): 21 | self.name = name 22 | self.categories = categories 23 | self.url = url 24 | self.headers = headers 25 | self.cookies = cookies 26 | self.html = html 27 | self.meta = meta 28 | self.scripts = scripts 29 | self.js = js 30 | self.implies = implies 31 | self.excludes = excludes 32 | self.icon = icon 33 | self.website = website 34 | self.cpe = cpe 35 | 36 | def __getitem__(self, k): 37 | return self.__dict__[k] 38 | 39 | def get(self, *args, **kwargs): 40 | return self.__dict__.get(*args, **kwargs) 41 | 42 | def __repr__(self): 43 | return repr(self.__dict__) 44 | 45 | 46 | class Category: 47 | def __init__(self, id: str, name: str, priority: int): 48 | self.id = id 49 | self.name = name 50 | self.priority = priority 51 | 52 | 53 | class Pattern: 54 | 55 | def __init__( 56 | self, 57 | value: str, 58 | regex, 59 | confidence: int, 60 | version: str, 61 | key: str 62 | ): 63 | self.value = value 64 | self.regex = regex 65 | self.confidence = confidence 66 | self.version = version 67 | self.key = key 68 | 69 | def __getitem__(self, k): 70 | return self.__dict__[k] 71 | 72 | def __repr__(self): 73 | return repr(self.__dict__) 74 | 75 | 76 | class Imply: 77 | """Structure to define a technology that is implied by the use of another 78 | one. 
79 | 80 | Attributes: 81 | name (str): Name of the implied technology. 82 | confidence (int): Confidence of the implied technology. 83 | 84 | """ 85 | 86 | def __init__(self, name: str, confidence: int): 87 | self.name = name 88 | self.confidence = confidence 89 | 90 | 91 | class Exclude: 92 | """Structure to define a technology that is incompatible with another 93 | one. 94 | 95 | Attributes: 96 | name (str): Name of the excluded technology. 97 | 98 | """ 99 | 100 | def __init__(self, name: str): 101 | self.name = name 102 | 103 | 104 | class PatternMatch: 105 | """Identifies a match in a technology pattern. 106 | 107 | Attributes: 108 | technology (Technology): Technology identified by the pattern. 109 | pattern (Pattern): Pattern that cause the match. 110 | version (str): Version identified by the pattern in the match. 111 | """ 112 | 113 | def __init__(self, technology: Technology, pattern: Pattern, version: str): 114 | self.technology = technology 115 | self.pattern = pattern 116 | self.version = version 117 | 118 | def __getitem__(self, k): 119 | return self.__dict__[k] 120 | 121 | def __repr__(self): 122 | return repr(self.__dict__) 123 | 124 | def __eq__(self, o): 125 | return ( 126 | self.technology.name == o.technology.name 127 | and self.pattern.key == self.pattern.key 128 | and self.pattern.value == self.pattern.value 129 | ) 130 | 131 | def __hash__(self): 132 | return hash( 133 | (self.technology.name, self.pattern.key, self.pattern.value) 134 | ) 135 | 136 | 137 | class TechMatch: 138 | """Identifies a match in a technology. 139 | 140 | Attributes: 141 | technology (Technology): Technology identified. 142 | confidence (int): Confidence in the match, is derivated from all the 143 | patterns of this technology that matched. 144 | version (str): Version identified by the patterns. 
145 | """ 146 | 147 | def __init__(self, technology: Technology, confidence: int, version: str): 148 | self.technology = technology 149 | self.confidence = confidence 150 | self.version = version 151 | 152 | def __getitem__(self, k): 153 | return self.__dict__[k] 154 | 155 | def __repr__(self): 156 | return repr(self.__dict__) 157 | 158 | def __eq__(self, o): 159 | return self.technology.name == o.technology.name 160 | -------------------------------------------------------------------------------- /services/engine/webs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | from flask import Flask 6 | 7 | from webs.api.utils.requests import before_request_middleware, \ 8 | after_request_middleware, teardown_appcontext_middleware 9 | from webs.api.utils.responses import JSONResponse, app_error_handler 10 | from webs.api.utils.routers import register_routes as init_routes 11 | from webs.api.utils.settings import init_db 12 | 13 | 14 | def create_app(): 15 | # instantiate the app 16 | app = Flask(__name__) 17 | 18 | # set config 19 | app_settings = os.getenv('APP_SETTINGS') 20 | app.config.from_object(app_settings) 21 | 22 | # register all blueprints 23 | init_routes(app=app) 24 | 25 | # register custom response class 26 | app.response_class = JSONResponse 27 | 28 | # register custom error handler 29 | app_error_handler(app=app) 30 | 31 | # register before request middleware 32 | before_request_middleware(app=app) 33 | 34 | # register after request middleware 35 | after_request_middleware(app=app) 36 | 37 | # register after app context teardown middleware 38 | teardown_appcontext_middleware(app=app) 39 | 40 | # set up extensions 41 | app_db = init_db(app=app) 42 | 43 | # shell context for flask cli 44 | @app.shell_context_processor 45 | def ctx(): 46 | return {'app': app, 'db': app_db} 47 | 48 | return app 49 | -------------------------------------------------------------------------------- /services/engine/webs/api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/api/bizs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/api/bizs/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import base64 4 | import os 5 | from io import BytesIO 6 | 7 | from PIL import Image 8 | from flask import make_response, send_file 9 | 10 | from webs.api.exceptions.customs import RecordNotFound, InvalidAPIRequest 11 | from webs.api.models.db_proxy import result_model_proxy 12 | 13 | 14 | class ResultBiz(object): 15 | 16 | def result_by_url(self, url, fields): 17 | """ 18 | 根据url查询结果 19 | :param url: 20 | :param fields: 21 | :return: 22 | """ 23 | 24 | # 获取url id 25 | 26 | return result_model_proxy.get_by_url(url, fields) 27 | 28 | def result_by_id(self, result_id): 29 | """ 30 | 根据result查询结果 31 | :param result_id: 32 | :return: 33 | """ 34 | return result_model_proxy.get_by_result_id(result_id) 35 | 36 | def get_screenshot(self, screenshot_id, download=False): 37 | """ 38 | 获取截图 39 | :param screenshot_id: 40 | :param download: 41 | :return: 42 | """ 43 | 44 | screenshot_path = 
f'/usr/src/app/screenshots/{screenshot_id}.png' 45 | if not os.path.exists(screenshot_path): 46 | raise RecordNotFound('截图不存在!') 47 | response = make_response(send_file( 48 | filename_or_fp=screenshot_path, 49 | as_attachment=download 50 | )) 51 | response.direct_passthrough = False 52 | return response 53 | 54 | def get_screenshot_base64_encode(self, screenshot_id): 55 | """ 56 | 获取截图Base64编码 57 | :param screenshot_id: 58 | :return: 59 | """ 60 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png' 61 | if not os.path.exists(screenshot_path): 62 | raise RecordNotFound('截图不存在!') 63 | with open(screenshot_path, 'rb') as f: 64 | img_encode_str = base64.b64encode(f.read()).decode('utf-8') 65 | return img_encode_str 66 | 67 | def download_har(self, har_uuid): 68 | """ 69 | 下载har文件 70 | :param har_uuid: 71 | :return: 72 | """ 73 | har_path = f'/usr/src/app/hars/{har_uuid}.json' 74 | if not os.path.exists(har_path): 75 | raise RecordNotFound('该文件不存在!') 76 | response = make_response(send_file( 77 | filename_or_fp=har_path, 78 | as_attachment=True 79 | )) 80 | response.direct_passthrough = False 81 | return response 82 | 83 | def get_favicon(self, favicon_md5, download=False): 84 | """ 85 | 获取图标 86 | :param favicon_md5: 87 | :param download: 88 | :return: 89 | """ 90 | 91 | newest_record = result_model_proxy.find(favicon_md5=favicon_md5) 92 | if not newest_record: 93 | raise RecordNotFound('图标不存在!') 94 | _, ext = os.path.splitext(newest_record.favicon_link) 95 | favicon_path = f'/usr/src/app/screenshots/{favicon_md5}{ext}' 96 | if not os.path.exists(favicon_path): 97 | raise RecordNotFound('图标不存在!') 98 | response = make_response(send_file( 99 | filename_or_fp=favicon_path, 100 | as_attachment=download 101 | )) 102 | response.direct_passthrough = False 103 | return response 104 | 105 | def get_small_screenshot(self, screenshot_id, wide, high): 106 | """查看图片缩略图""" 107 | 108 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png' 109 | if not os.path.exists(screenshot_path): 110 | raise RecordNotFound('截图不存在!') 111 | im = Image.open(f'/usr/src/app/screenshots/{screenshot_id}.png') 112 | 113 | src_wide, src_high = im.size 114 | ratio = src_wide / wide 115 | im = im.resize((wide, int(src_high / ratio)), Image.ANTIALIAS) 116 | im = im.crop((0, 0, wide, high)) 117 | 118 | # 存入临时内存中 119 | byte_io = BytesIO() 120 | im.save(byte_io, 'PNG') 121 | byte_io.seek(0) 122 | 123 | response = make_response(send_file( 124 | filename_or_fp=byte_io, 125 | as_attachment=False, 126 | mimetype='image/png' 127 | # attachment_filename=f'{screenshot_id}.png' 128 | )) 129 | response.direct_passthrough = False 130 | return response 131 | -------------------------------------------------------------------------------- /services/engine/webs/api/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/api/exceptions/customs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from werkzeug.exceptions import BadRequest, \ 5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict 6 | 7 | 8 | class CustomException(Exception): 9 | """Custom JSON based exception.""" 10 | 11 | status_code = BadRequest.code 12 | message = "" 13 | 14 | def __init__(self, message=None, status_code=None): 15 | """ 16 | :param status_code: response status_code 17 | :param 
message: exception message 18 | """ 19 | 20 | Exception.__init__(self) 21 | 22 | if message is not None: 23 | self.message = message 24 | if status_code is not None: 25 | self.status_code = status_code 26 | 27 | def to_dict(self): 28 | return { 29 | "status": False, 30 | "error": { 31 | "message": self.message, 32 | "type": str(self.__class__.__name__) 33 | } 34 | } 35 | 36 | 37 | class InvalidContentType(CustomException): 38 | """ 39 | Raised when an invalid Content-Type is provided. 40 | """ 41 | 42 | status_code = BadRequest.code 43 | 44 | 45 | class UnauthorizedAPIRequest(CustomException): 46 | """ 47 | Raise if the user is not authorized. Also used if you want to use HTTP 48 | basic auth. 49 | """ 50 | 51 | status_code = Unauthorized.code 52 | 53 | 54 | class InvalidPermissions(CustomException): 55 | """ 56 | Raise if the user doesn't have the permission for the requested resource 57 | but was authenticated. 58 | """ 59 | 60 | status_code = Forbidden.code 61 | 62 | 63 | class InvalidAPIRequest(CustomException): 64 | """ 65 | Raised when an invalid request has been made. 66 | (e.g. accessed unexisting url, the schema validation did 67 | not pass) 68 | """ 69 | 70 | status_code = BadRequest.code 71 | 72 | 73 | class ServerError(CustomException): 74 | """ 75 | Generic internal error. 76 | Inherit this error for all subsequent 77 | errors that are related to database. 78 | """ 79 | 80 | status_code = InternalServerError.code 81 | 82 | 83 | class DatabaseError(CustomException): 84 | """ 85 | Generic database interaction error. 86 | Inherit this error for all subsequent 87 | errors that are related to database. 88 | """ 89 | 90 | status_code = InternalServerError.code 91 | 92 | 93 | class RecordNotFound(DatabaseError): 94 | """ 95 | Raised when the record was not found in the database. 96 | """ 97 | 98 | status_code = NotFound.code 99 | 100 | 101 | class RecordAlreadyExists(DatabaseError): 102 | """ 103 | Raised in the case of violation of a unique constraint. 104 | """ 105 | 106 | status_code = Conflict.code 107 | 108 | 109 | class PublishError(CustomException): 110 | """ 111 | Raised in the case of violation of a publish error. 
112 | """ 113 | 114 | status_code = InternalServerError.code 115 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from .base_model import db, redis_store 5 | from .task import Task, SubTask, ScheduleTaskRecord 6 | from .url import Url 7 | from .task_url import TaskUrl 8 | from .server import Server 9 | from .result import Result 10 | from .apscheduler_job import APSchedulerJobs 11 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/apscheduler_job.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from sqlalchemy import Column, types 5 | from sqlalchemy.dialects import postgresql 6 | 7 | from webs.api.models import db 8 | 9 | """ 10 | APScheduler任务存储表 11 | """ 12 | 13 | 14 | class APSchedulerJobs(db.Model): 15 | __tablename__ = 'apscheduler_jobs' 16 | 17 | id = Column(types.String(length=191), primary_key=True) 18 | next_run_time = Column(postgresql.DOUBLE_PRECISION(precision=53), index=True) 19 | job_state = Column(postgresql.BYTEA(), nullable=False) 20 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/base_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask_sqlalchemy import SQLAlchemy 4 | from flask_redis import FlaskRedis 5 | 6 | db = SQLAlchemy() 7 | redis_store = FlaskRedis() 8 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .apschedule import ApschedulerModelProxy 3 | from .result import ResultModelProxy 4 | from .schedule_task import ScheduleTaskProxy 5 | from .server import ServerModelProxy 6 | from .task import TaskModelProxy 7 | from .task_url import TaskUrlModelProxy 8 | from .url import UrlModelProxy 9 | from .subtask import SubTaskModelProxy 10 | 11 | task_model_proxy = TaskModelProxy() 12 | schedule_task_proxy = ScheduleTaskProxy() 13 | url_model_proxy = UrlModelProxy() 14 | task_url_model_proxy = TaskUrlModelProxy() 15 | server_model_proxy = ServerModelProxy() 16 | subtask_model_proxy = SubTaskModelProxy() 17 | result_model_proxy = ResultModelProxy() 18 | apscheduler_model_proxy = ApschedulerModelProxy() 19 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/apschedule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | from datetime import datetime 4 | 5 | from webs.api.models import APSchedulerJobs 6 | from webs.api.models.db_proxy.base import BaseModelProxy 7 | 8 | 9 | class ApschedulerModelProxy(BaseModelProxy): 10 | def __init__(self): 11 | super().__init__() 12 | self.model = APSchedulerJobs 13 | 14 | def get_next_run_time(self, apschedule_id): 15 | """ 16 | 获取下一次任务执行时间 17 | :param apschedule_id: 18 | :return: 19 | """ 20 | schedule_obj = self.find(id=str(apschedule_id)) 21 | if schedule_obj and schedule_obj.next_run_time: 22 | return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(schedule_obj.next_run_time)) 23 | return 24 | 
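Every proxy above inherits from BaseModelProxy in webs/api/models/db_proxy/base.py, which is not reproduced in this section. A minimal sketch of the contract the proxies appear to rely on — find(), set_many_attr(), safe_commit(), db_session and a model-bound self_session query — assuming Flask-SQLAlchemy's scoped session; the real base.py may differ:

# -*- coding: utf-8 -*-
# Hedged sketch only: approximates the interface used by the proxies above.

from webs.api.models import db


class BaseModelProxy(object):
    def __init__(self):
        self.model = None
        self.db_session = db.session

    @property
    def self_session(self):
        # Query object already bound to the proxy's model
        return self.db_session.query(self.model)

    def find(self, **kwargs):
        # First record matching simple equality filters, or None
        return self.self_session.filter_by(**kwargs).first()

    def set_many_attr(self, obj=None, obj_id=None, fields_v=None):
        # Update several columns on one record and commit
        obj = obj if obj is not None else self.find(id=obj_id)
        for field, value in (fields_v or {}).items():
            setattr(obj, field, value)
        self.safe_commit()
        return obj

    def safe_commit(self):
        try:
            self.db_session.commit()
        except Exception:
            self.db_session.rollback()
            raise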
-------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import Result 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class ResultModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = Result 11 | 12 | def save_model_by_grpc(self, **kwargs): 13 | """ 14 | 基于rpc回收爬虫数据 15 | :param kwargs: 16 | :return: 17 | """ 18 | 19 | obj = Result( 20 | subtask_id=kwargs['subtask_id'], url_id=kwargs['url_id'], url_address=kwargs['url_address'], 21 | http_code=kwargs.get('http_code'), title=kwargs.get('title'), content=kwargs.get('content'), 22 | current_url=kwargs.get('current_url'), redirect_chain=kwargs.get('redirect_chain', []), 23 | response_headers=kwargs.get('response_headers', {}), har_uuid=kwargs.get('har_uuid'), 24 | screenshot_id=kwargs.get('screenshot_id'), finished_at=kwargs['finished_at'], 25 | cookies=kwargs.get('cookies', []), wappalyzer_results=kwargs.get('wappalyzer_results', []), 26 | text=kwargs.get('text'), favicon_md5=kwargs.get('favicon_md5'), favicon_link=kwargs.get('favicon_link'), 27 | response_time=kwargs.get('response_time'), load_complete_time=kwargs.get('load_complete_time'), 28 | charset=kwargs.get('charset') 29 | ) 30 | self.db_session.add(obj) 31 | self.db_session.flush() 32 | self.safe_commit() 33 | return obj 34 | 35 | def get_by_url(self, url, fields): 36 | """ 37 | 基于url取结果 38 | :param url: 39 | :param fields: 40 | :return: 41 | """ 42 | 43 | # obj = self.self_session.filter( 44 | # or_(self.model.url_address == url.rstrip('/'), self.model.url_address == url.rstrip('/') + '/')) \ 45 | # .order_by(self.model.finished_at.desc()).first() 46 | from webs.api.models.db_proxy import url_model_proxy 47 | url_obj = url_model_proxy.find(address=url) 48 | if not url_obj: 49 | return {} 50 | 51 | # 查询所有记录 52 | objs = self.self_session.filter(self.model.url_id == url_obj.id) \ 53 | .order_by(self.model.id.desc()).all() 54 | 55 | latest_record = {} 56 | if objs: 57 | latest_record = objs[0].as_dict() 58 | latest_record['other_records'] = [{ 59 | 'result_id': each.id, 60 | 'finished_at': each.finished_at.strftime("%Y-%m-%d %H:%M:%S")} 61 | for each in objs[1:] 62 | ] 63 | 64 | if fields: latest_record = {each: latest_record[each] for each in fields if each in latest_record} 65 | 66 | return latest_record 67 | 68 | def get_by_result_id(self, result_id): 69 | """ 70 | 基于id取结果 71 | :param result_id: 72 | :return: 73 | """ 74 | 75 | obj = self.self_session.filter(self.model.id == result_id).order_by(self.model.finished_at.desc()).first() 76 | return {} if not obj else obj.as_dict() 77 | 78 | def get_favicon_data_by_url(self, url): 79 | """ 80 | 根据url获取已存在的图标信息 81 | :param url: 82 | :return: 83 | """ 84 | 85 | obj = self.db_session.query(self.model.favicon_md5, self.model.favicon_link) \ 86 | .filter(self.model.url_address == url).order_by(self.model.create_time.desc()).first() 87 | return (None, None) if not obj else (obj[0], obj[1]) 88 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/schedule_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import ScheduleTaskRecord, SubTask 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 
7 | class ScheduleTaskProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = ScheduleTaskRecord 11 | 12 | def query_schedule_task_obj_by_subtask_id(self, subtask_id): 13 | """ 14 | 基于子任务查询调度任务 15 | :return: 16 | """ 17 | 18 | return self.self_session \ 19 | .join(SubTask, SubTask.schedule_task_id == self.model.id) \ 20 | .filter(SubTask.id == subtask_id).first() 21 | 22 | def query_running_schedule_tasks(self, task_id): 23 | """ 24 | 查询正在执行中的调度任务 25 | :param task_id: 26 | :return: 27 | """ 28 | return self.self_session.filter( 29 | self.model.task_id == task_id, 30 | self.model.finished.is_(False) 31 | ).all() 32 | 33 | def query_running_task_and_task_id(self, schedule_task_id): 34 | """ 35 | 查询主任务下正在执行调度任务 36 | :param schedule_task_id: 37 | :return: 38 | """ 39 | schedule_task_obj = self.find(id=schedule_task_id) 40 | return schedule_task_obj.task_id, self.query_running_schedule_tasks(schedule_task_obj.task_id) 41 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sqlalchemy import desc, asc 3 | 4 | from webs.api.models import Server 5 | from webs.api.models.db_proxy.base import BaseModelProxy 6 | 7 | 8 | class ServerModelProxy(BaseModelProxy): 9 | def __init__(self): 10 | super().__init__() 11 | self.model = Server 12 | 13 | def query_servers_by_score(self, sort='desc'): 14 | """ 15 | 根据权重和负载计算服务器得分 16 | :return: 17 | """ 18 | 19 | query = self.self_session.filter(self.model.enabled.is_(True), self.model.status.is_(True)).all() 20 | results = [{ 21 | 'server_id': each_obj.id, 22 | 'server_name': each_obj.server_name, 23 | 'server_address': each_obj.server_address, 24 | 'score': int((1 - float(each_obj.load)) * each_obj.weight * 10) 25 | } for each_obj in query] 26 | return sorted(results, key=lambda x: x['score'], reverse=True if sort == 'desc' else False) 27 | 28 | def add_server(self, address): 29 | """新增爬虫服务器节点""" 30 | obj = Server(server_name=address, server_address=address) 31 | self.db_session.add(obj) 32 | self.safe_commit() 33 | return 34 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/subtask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import SubTask 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class SubTaskModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = SubTask 11 | 12 | def create(self, schedule_task_id, server_id): 13 | """ 14 | 创建子任务 15 | :param schedule_task_id: 16 | :param server_id: 17 | :return: 18 | """ 19 | 20 | obj = SubTask(schedule_task_id=schedule_task_id, server_id=server_id) 21 | self.db_session.add(obj) 22 | self.safe_commit() 23 | return obj 24 | 25 | def query_delivery_failure_count(self, schedule_task_id): 26 | """ 27 | 查询下发失败的子任务 28 | :return: 29 | """ 30 | return self.self_session.filter( 31 | self.model.schedule_task_id == schedule_task_id, 32 | self.model.delivery_failure_msg.isnot(None) 33 | ).count() 34 | 35 | def query_unfinished_subtask_count(self, schedule_task_id): 36 | """ 37 | 根据子任务id查询当前调度任务未完成的子任务数量 38 | :param schedule_task_id: 39 | :return: 40 | """ 41 | return self.self_session.filter( 42 | self.model.schedule_task_id == schedule_task_id, 
self.model.finished.is_(False) 43 | ).count() 44 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import Task, SubTask, TaskUrl, Result, ScheduleTaskRecord 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class TaskModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = Task 11 | 12 | def create(self, 13 | customer_id=None, 14 | task_name=None, 15 | extra_data=None, 16 | task_status='executing', 17 | crawl_options={}, 18 | **kwargs): 19 | """ 20 | :return: 21 | """ 22 | obj = Task( 23 | customer_id=customer_id, task_name=task_name, 24 | task_status=task_status, crawl_options=crawl_options, extra_data=extra_data, 25 | schedule_options={'schedule_type': kwargs['schedule_type'], 'schedule_data': kwargs['schedule_data']}) 26 | self.db_session.add(obj) 27 | self.db_session.flush() 28 | self.safe_commit() 29 | 30 | return obj 31 | 32 | def query_task_obj_by_subtask(self, subtask_id): 33 | """ 34 | 通过子任务获取主任务模型对象 35 | :param subtask_id: 36 | :return: 37 | """ 38 | 39 | task_obj = self.db_session.query(self.model).select_from(self.model) \ 40 | .join(ScheduleTaskRecord, ScheduleTaskRecord.task_id == self.model.id) \ 41 | .join(SubTask, SubTask.schedule_task_id == ScheduleTaskRecord.id) \ 42 | .filter(SubTask.id == subtask_id) \ 43 | .first() 44 | 45 | return task_obj 46 | 47 | def query_url_count(self, task_id): 48 | """ 49 | 获取url总数 50 | :param task_id: 51 | :return: 52 | """ 53 | 54 | return self.db_session.query(TaskUrl).filter(TaskUrl.task_id == task_id).count() 55 | 56 | def query_crawl_url_count(self, task_id): 57 | """ 58 | 获取已爬取的url总数 59 | :param task_id: 60 | :return: 61 | """ 62 | 63 | return self.db_session.query(Result) \ 64 | .join(SubTask, Result.subtask_id == SubTask.id) \ 65 | .join(ScheduleTaskRecord, ScheduleTaskRecord.id == SubTask.schedule_task_id) \ 66 | .filter(ScheduleTaskRecord.task_id == task_id).count() 67 | 68 | def add_schedule_record(self, task_id, schedule_task_status, crawl_options): 69 | """ 70 | 增加调度记录 71 | :param task_id: 72 | :param schedule_task_status: 73 | :param crawl_options: 74 | :return: 75 | """ 76 | obj = ScheduleTaskRecord( 77 | task_id=task_id, 78 | crawl_options=crawl_options, 79 | schedule_task_status=schedule_task_status 80 | ) 81 | self.db_session.add(obj) 82 | self.safe_commit() 83 | return obj 84 | 85 | def query_task_loop_count(self, task_id): 86 | """ 87 | 获取任务已跑轮次 88 | :param task_id: 89 | :return: 90 | """ 91 | 92 | return self.db_session.query(ScheduleTaskRecord).filter(ScheduleTaskRecord.task_id == task_id).count() 93 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/task_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import TaskUrl, Url 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class TaskUrlModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = TaskUrl 11 | 12 | def create(self, task_id, urls_id): 13 | """ 14 | :return: 15 | """ 16 | self.db_session.add_all( 17 | [TaskUrl(task_id=task_id, url_id=url_id) for url_id in urls_id]) 18 | self.safe_commit() 19 | 20 | def create_subtask_url_mapping(self, chunk_url, subtask_id): 21 
| """ 22 | 创建子任务与url映射关系 23 | :param chunk_url: 24 | :param subtask_id: 25 | :return: 26 | """ 27 | urls_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(chunk_url)).all() 28 | self.self_session.filter(self.model.url_id.in_([each[0] for each in urls_query])).update( 29 | {self.model.sub_task_id: subtask_id}, synchronize_session='fetch') 30 | self.safe_commit() 31 | return [{'url_id': each[0], 'url_address': each[1]} for each in urls_query] 32 | 33 | def query_urls_by_task_id(self, task_id): 34 | """ 35 | 根据task id查询关联的url 36 | :param task_id: 37 | :return: 38 | """ 39 | 40 | query = self.db_session.query(self.model.url_id, Url.address) \ 41 | .join(Url, Url.id == self.model.url_id) \ 42 | .filter(self.model.task_id == task_id) \ 43 | .all() 44 | return [{ 45 | 'url_id': each_obj[0], 'url_address': each_obj[1]} 46 | for each_obj in query 47 | ] 48 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/db_proxy/url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import Url 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class UrlModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = Url 11 | 12 | def create(self, urls): 13 | """ 14 | :return: 15 | """ 16 | 17 | # 检测系统中已存在的url 18 | exist_url_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(urls)).all() 19 | exist_urls_id = [each[0] for each in exist_url_query] 20 | 21 | # 创建在系统中不存在的url 22 | not_create_urls = set(urls).difference(set([each[1] for each in exist_url_query])) 23 | create_url_models = [Url(address=url) for url in not_create_urls] 24 | self.db_session.add_all(create_url_models) 25 | self.safe_commit() 26 | 27 | exist_urls_id.extend([each.id for each in create_url_models]) 28 | return exist_urls_id 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 存储结果 5 | """ 6 | 7 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text 8 | from sqlalchemy.dialects.postgresql import JSONB 9 | 10 | from webs.api.models import db 11 | 12 | 13 | class Result(db.Model): 14 | __tablename__ = 'results' 15 | 16 | id = Column(BigInteger, primary_key=True, autoincrement=True) 17 | subtask_id = Column(Integer, nullable=False, index=True) # 所属子任务任务id 18 | url_id = Column(Integer, nullable=False, index=True) # url id 19 | url_address = Column(String(1024), nullable=False) # url 地址 20 | http_code = Column(Integer) # 网站状态码 21 | title = Column(Text) # 网站标题 22 | content = Column(Text) # 网站内容 23 | text = Column(Text) # 网页正文 24 | current_url = Column(String(1024)) # 网站最后相应的地址 25 | redirect_chain = Column(JSONB) # 重定向链接 26 | response_headers = Column(JSONB) # response headers 27 | har_uuid = Column(String(128)) # 网站交互过程 28 | screenshot_id = Column(String(128)) # 截图Id 29 | cookies = Column(JSONB) # cookies 30 | finished_at = Column(TIMESTAMP) # 完成时间 31 | wappalyzer_results = Column(JSONB) # 网站指纹 32 | callback_failure_msg = Column(Text) # 回调错误信息 33 | favicon_md5 = Column(String(50)) # 网站图标hash值 34 | favicon_link = Column(String(1024)) # 网站图标链接 35 | response_time = Column(Integer) # 网站响应时间 36 | load_complete_time = Column(Integer) # 
页面加载完成时间 37 | charset = Column(String(256)) # 网站编码 38 | 39 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 40 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 41 | 42 | def __repr__(self): 43 | return f'' 44 | 45 | def as_dict(self): 46 | from webs.api.models.db_proxy import task_model_proxy 47 | task_obj = task_model_proxy.query_task_obj_by_subtask(self.subtask_id) 48 | 49 | return { 50 | 'result_id': self.id, 51 | 'subtask_id': self.subtask_id, 52 | 'task_id': task_obj.id if task_obj else None, 53 | 'customer_id': task_obj.customer_id if task_obj else None, 54 | 'url_id': self.url_id, 55 | 'url_address': self.url_address, 56 | 'http_code': self.http_code, 57 | 'title': self.title, 58 | 'content': self.content, 59 | 'text': self.text, 60 | 'current_url': self.current_url, 61 | 'redirect_chain': self.redirect_chain, 62 | 'response_headers': self.response_headers, 63 | 'har_uuid': self.har_uuid, 64 | 'screenshot_id': self.screenshot_id, 65 | 'cookies': self.cookies, 66 | 'favicon_md5': self.favicon_md5, 67 | 'favicon_link': self.favicon_link, 68 | 'wappalyzer_results': self.wappalyzer_results, 69 | 'response_time': self.response_time, 70 | 'load_complete_time': self.load_complete_time, 71 | 'charset': self.charset, 72 | 'finished_at': self.finished_at.strftime("%Y-%m-%d %H:%M:%S") 73 | } 74 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | 爬虫节点 6 | """ 7 | 8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Boolean, Integer, Float 9 | 10 | from webs.api.models import db 11 | 12 | 13 | class Server(db.Model): 14 | __tablename__ = 'servers' 15 | 16 | id = Column(BigInteger, primary_key=True, autoincrement=True) 17 | server_name = Column(String(128)) # 爬虫节点服务器名称 18 | server_address = Column(String(255), unique=True, nullable=True) # 服务器地址 ip:port 19 | enabled = Column(Boolean, server_default='t') # 是否启用 默认是 20 | status = Column(Boolean, server_default='t') # 服务器状态是否正常 默认是 21 | weight = Column(Integer, server_default='3') # 服务器权重 1,2,3,4,5 默认为3 22 | # load = Column(Integer, server_default='0') # 服务器负载,子服务器定时向主节点发送 23 | load = Column(String(20), server_default='0.1') # 服务器负载,子服务器定时向主节点发送 24 | spider_type = Column(String(20), server_default='splash') # 爬虫节点类型 splash/pyppeteer 25 | 26 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 27 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 28 | 29 | def __repr__(self): 30 | return f'' 31 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | 动态爬虫扫描任务模型 6 | """ 7 | from datetime import datetime 8 | 9 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY, Boolean, Text 10 | from sqlalchemy.dialects.postgresql import JSONB 11 | 12 | from webs.api.models import db 13 | 14 | 15 | class Task(db.Model): 16 | __tablename__ = 'tasks' 17 | 18 | id = Column(BigInteger, primary_key=True, autoincrement=True) 19 | customer_id = Column(String(255), index=True) # 纯粹用来作为API调用标识,API 返回时被原样返回,以方便 API 调用方匹配请求与返回。 20 | task_name = Column(String(255)) # 任务名称 21 | task_status = 
Column(String(255)) # task任务状态 22 | finished = Column(Boolean, server_default='f') # 任务是否已完成 23 | schedule_options = Column(JSONB) # 周期调度相关参数 24 | crawl_options = Column(JSONB) # 爬取相关参数 25 | extra_data = Column(Text) # 客户端额外数据 26 | 27 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 28 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 29 | 30 | def __repr__(self): 31 | return f'' 32 | 33 | def as_dict(self, **kwargs): 34 | return { 35 | 'task_id': self.id, 36 | 'customer_id': self.customer_id, 37 | 'task_name': self.task_name, 38 | 'task_status': self.task_status, 39 | 'finished': self.finished, 40 | 'crawl_options': self.crawl_options, 41 | 'schedule_options': self.schedule_options, 42 | 'extra_data': self.extra_data, 43 | 'create_time': self.create_time.strftime("%Y-%m-%d %H:%M:%S"), 44 | 'update_time': self.update_time.strftime("%Y-%m-%d %H:%M:%S") 45 | } if not kwargs.get('fields') else { 46 | f: getattr(self, f, None) if not isinstance(getattr(self, f), datetime) 47 | else getattr(self, f).strftime("%Y-%m-%d %H:%M:%S") 48 | for f in kwargs['fields'] if f in self.__table__.columns 49 | } 50 | 51 | 52 | class SubTask(db.Model): 53 | __tablename__ = 'sub_tasks' 54 | 55 | id = Column(BigInteger, primary_key=True, autoincrement=True) 56 | schedule_task_id = Column(Integer, nullable=False, index=True) # 所属某次调度任务id 57 | server_id = Column(Integer, nullable=False) # 此子任务关联的服务器节点 58 | assigned_urls = Column(ARRAY(String)) # 此子任务所分配的url 59 | delivery_failure_msg = Column(Text) # 发送失败原因 60 | finished = Column(Boolean, server_default='f') # 是否已完成 61 | finished_at = Column(TIMESTAMP) # 完成时间 62 | 63 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 64 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 65 | 66 | def __repr__(self): 67 | return f'' 68 | 69 | 70 | class ScheduleTaskRecord(db.Model): 71 | __tablename__ = 'schedule_task_records' 72 | 73 | id = Column(BigInteger, primary_key=True, autoincrement=True) 74 | task_id = Column(Integer, nullable=False, index=True) # 所属任务id 75 | schedule_task_status = Column(String(255)) # task任务状态 76 | finished = Column(Boolean, server_default='f') # 此次任务是否已完成 77 | crawl_options = Column(JSONB) # 此次任务所使用的爬取参数 78 | 79 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) # 调度任务创建时间 80 | start_time = Column(TIMESTAMP) # 此次调度任务真正开始执行时间 81 | finished_time = Column(TIMESTAMP) # 任务完成时间 82 | 83 | def __repr__(self): 84 | return f'' 85 | -------------------------------------------------------------------------------- /services/engine/webs/api/models/task_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 任务与url映射关系 5 | """ 6 | 7 | from sqlalchemy import Column, BigInteger, TIMESTAMP, func 8 | 9 | from webs.api.models import db 10 | 11 | 12 | class TaskUrl(db.Model): 13 | __tablename__ = 'task_url' 14 | 15 | id = Column(BigInteger, primary_key=True, autoincrement=True) 16 | task_id = Column(BigInteger, nullable=True, index=True) 17 | # sub_task_id = Column(BigInteger, index=True) # 子任务id 18 | url_id = Column(BigInteger, nullable=True, index=True) 19 | 20 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 21 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 22 | 23 | def __repr__(self): 24 | return f'' 25 | 
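The models above point at each other only through integer id columns (Task ← ScheduleTaskRecord ← SubTask ← Result, plus TaskUrl for the url mapping), not ORM relationships, so aggregate figures are computed with explicit joins. A small illustration (assuming the Flask app from manage.py) that mirrors query_url_count and query_crawl_url_count from the task proxy:

# -*- coding: utf-8 -*-
# Illustration only: derive crawl progress for a task with explicit joins.

from manage import app
from webs.api.models import db, Result, ScheduleTaskRecord, SubTask, TaskUrl


def crawl_progress(task_id):
    # Returns (total_urls, crawled_urls) for one main task.
    with app.app_context():
        total = db.session.query(TaskUrl).filter(TaskUrl.task_id == task_id).count()
        crawled = (
            db.session.query(Result)
            .join(SubTask, Result.subtask_id == SubTask.id)
            .join(ScheduleTaskRecord, ScheduleTaskRecord.id == SubTask.schedule_task_id)
            .filter(ScheduleTaskRecord.task_id == task_id)
            .count()
        )
        return total, crawled


if __name__ == '__main__':
    print(crawl_progress(task_id=1))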
-------------------------------------------------------------------------------- /services/engine/webs/api/models/url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | Url模型 6 | """ 7 | 8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func 9 | 10 | from webs.api.models import db 11 | 12 | 13 | class Url(db.Model): 14 | __tablename__ = 'urls' 15 | 16 | id = Column(BigInteger, primary_key=True, autoincrement=True) 17 | address = Column(String(1024), unique=True, index=True) 18 | 19 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 20 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 21 | 22 | def __repr__(self): 23 | return f'' 24 | -------------------------------------------------------------------------------- /services/engine/webs/api/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from datetime import datetime 4 | 5 | from webs.api.models.db_proxy import task_model_proxy 6 | from webs.api.exceptions.customs import InvalidAPIRequest, RecordNotFound 7 | from webs.api.utils.helper import nowstr, today 8 | 9 | 10 | class LengthChecker(object): 11 | """字段长度校验""" 12 | 13 | def __init__(self, sign, length): 14 | self.sign = sign 15 | self.length = length 16 | 17 | def __call__(self, verified): 18 | if verified is not None and len(verified) > self.length: 19 | raise InvalidAPIRequest(f'{self.sign}长度过长!') 20 | 21 | 22 | class OneOf(object): 23 | """Validator which succeeds if ``value`` is a member of ``choices``""" 24 | 25 | def __init__(self, choices): 26 | self.choices = choices 27 | 28 | def __call__(self, verified): 29 | if verified not in self.choices: 30 | raise InvalidAPIRequest(f'请选择{self.choices}其中之一!') 31 | 32 | 33 | class TaskValidator(object): 34 | """主任务验证器""" 35 | 36 | def __init__(self): 37 | self.url_pattern = r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?' 
38 | 39 | def url_valid(self, url): 40 | if not re.match(self.url_pattern, url): 41 | raise InvalidAPIRequest(f'{url}不是一个合法的Url!') 42 | 43 | def task_id_exist_valid(self, task_id): 44 | if not task_model_proxy.find(id=task_id): 45 | raise RecordNotFound('该任务不存在!') 46 | 47 | @staticmethod 48 | def schedule_valid(kwargs): 49 | """验证周期性调度相关参数""" 50 | schedule_type, schedule_data = kwargs.get('schedule_type'), kwargs.get('schedule_data') 51 | 52 | # 验证定时执行 53 | if schedule_type == 'datetime': 54 | if len(schedule_data) > 1: 55 | raise InvalidAPIRequest('无效的执行时间!') 56 | run_date = schedule_data.get('run_date') 57 | if not run_date: 58 | raise InvalidAPIRequest('无效的执行时间!') 59 | # 和当前时间比较 60 | if run_date <= nowstr(): 61 | raise InvalidAPIRequest('执行时间不能小于当前时间!') 62 | 63 | # 验证间隔执行和周期调度 64 | elif schedule_type in ('interval', 'cron'): 65 | if not schedule_data.get('start_date') or not schedule_data.get('end_date'): 66 | raise InvalidAPIRequest('请输入开始时间和结束时间!') 67 | 68 | interval_effective_params = { 69 | 'weeks', 'days', 'hours', 'minutes', 'seconds', 70 | 'start_date', 'end_date', 'max_instances' 71 | } 72 | cron_effective_params = { 73 | 'week', 'day', 'hour', 'minute', 74 | 'second', 'year', 'month', 75 | 'day_of_week', 'max_instances', 76 | 'start_date', 'end_date' 77 | } 78 | 79 | if (schedule_type == 'cron' and set(schedule_data.keys()).difference(cron_effective_params)) or ( 80 | schedule_type == 'interval' and set(schedule_data.keys()).difference(interval_effective_params)): 81 | raise InvalidAPIRequest('无效的调度参数!') 82 | 83 | if not set(schedule_data.keys()).difference({'start_date', 'end_date'}): 84 | raise InvalidAPIRequest('请输入正确的调度参数!') 85 | 86 | if schedule_data.get('start_date') >= schedule_data.get('end_date'): 87 | raise InvalidAPIRequest('开始时间不能大于结束时间!') 88 | 89 | if schedule_data.get('end_date') < today(): 90 | raise InvalidAPIRequest('结束时间不能小于当前时间!') 91 | 92 | 93 | class TimeValidator(object): 94 | 95 | def __init__(self, s=None, e=None): 96 | self.s = s 97 | self.e = e 98 | 99 | @staticmethod 100 | def date_or_datetime_valid(_time): 101 | try: 102 | datetime.strptime(_time, "%Y-%m-%d") 103 | return 104 | except (ValueError, AttributeError) as e: 105 | pass 106 | try: 107 | datetime.strptime(_time, "%Y-%m-%d %H:%M:%S") 108 | return 109 | except (ValueError, AttributeError) as e: 110 | pass 111 | raise InvalidAPIRequest('请输入正确的日期时间!') 112 | 113 | def __call__(self, time_field): 114 | if not self.s <= time_field <= self.e: 115 | raise InvalidAPIRequest('请输入正确的时间范围!') 116 | 117 | 118 | task_validator = TaskValidator() 119 | time_validator = TimeValidator() 120 | -------------------------------------------------------------------------------- /services/engine/webs/api/schemas/results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from webargs import fields 3 | 4 | result_by_url_schema = { 5 | "url": fields.Url(required=True), 6 | "fields": fields.DelimitedList(fields.Str(), missing=[]) 7 | } 8 | 9 | result_by_id_schema = { 10 | "result_id": fields.Int(required=True) 11 | } 12 | 13 | get_screenshot_schema = { 14 | 'screenshot_id': fields.Str(required=True) 15 | } 16 | 17 | download_har_file_schema = { 18 | 'har_uuid': fields.Str(required=True) 19 | } 20 | 21 | get_favicon_schema = { 22 | 'favicon_md5': fields.Str(required=True) 23 | } 24 | 25 | get_small_schema = { 26 | **get_screenshot_schema, 27 | 'wide': fields.Int(missing=272), 28 | 'high': fields.Int(missing=165) 29 | } 30 | 
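The validators above are plain callables that raise InvalidAPIRequest on failure, which is what lets the webargs schemas attach them directly as validate= arguments. A short usage illustration (the sample values are made up):

# -*- coding: utf-8 -*-
# Usage illustration for the validators defined in webs/api/schemas/__init__.py.

from webs.api.exceptions.customs import InvalidAPIRequest
from webs.api.schemas import LengthChecker, OneOf, task_validator

name_checker = LengthChecker(sign='task_name', length=255)
name_checker('my-task')                      # within the limit: passes silently

browser_checker = OneOf(['chromium', 'firefox'])
try:
    browser_checker('webkit')                # not a valid choice
except InvalidAPIRequest as exc:
    print(exc.message)

task_validator.url_valid('https://example.com')   # matches the url pattern
try:
    task_validator.url_valid('not-a-url')          # no scheme: rejected
except InvalidAPIRequest as exc:
    print(exc.message)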
-------------------------------------------------------------------------------- /services/engine/webs/api/schemas/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from webargs import fields 5 | 6 | from webs.api.schemas import LengthChecker, OneOf, task_validator, TimeValidator as TimeRangeValid, time_validator 7 | 8 | schedule_data = { 9 | 'run_date': fields.Str(validate=time_validator.date_or_datetime_valid), # 定时执行时间 10 | 'year': fields.Int(validate=TimeRangeValid(2021, 2999)), 11 | 'month': fields.Int(validate=TimeRangeValid(1, 12)), 12 | 'day': fields.Int(validate=TimeRangeValid(1, 31)), 13 | 'week': fields.Int(validate=TimeRangeValid(1, 53)), 14 | 'day_of_week': fields.Int(validate=TimeRangeValid(0, 6)), 15 | 'hour': fields.Int(validate=TimeRangeValid(0, 23)), 16 | 'minute': fields.Int(validate=TimeRangeValid(0, 59)), 17 | 'second': fields.Int(validate=TimeRangeValid(0, 59)), 18 | 'weeks': fields.Int(), 19 | 'days': fields.Int(), 20 | 'hours': fields.Int(), 21 | 'minutes': fields.Int(), 22 | 'seconds': fields.Int(), 23 | 'start_date': fields.Str(validate=time_validator.date_or_datetime_valid), 24 | 'end_date': fields.Str(validate=time_validator.date_or_datetime_valid), 25 | 'max_instances': fields.Int(missing=1) 26 | } 27 | 28 | crawl_options = { 29 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])), 30 | 'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])), # 任务优先级 31 | 'headless': fields.Bool(missing=False), # 有头/无头模式 默认使用有头模式 32 | 'debug': fields.Bool(missing=False), # 是否开启调试模式, 33 | 'referer': fields.Str(), # 网站来路地址 34 | 'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])), # 并发数 35 | 'url_timeout': fields.Int(missing=30), # 单个url超时时间 36 | 'enabled_render_js': fields.Bool(missing=True), 37 | 'page_wait_time': fields.Int(missing=3), # 等待页面js渲染时间 38 | 'ignore_ssl': fields.Bool(missing=True), # 是否忽略证书错误 39 | 'screenshot': fields.Bool(missing=False), # 是否截图 40 | 'proxy_url': fields.Str(), # 代理 41 | 'user_agent': fields.Str(), # Ua 42 | 'record_har': fields.Bool(missing=False), # 请求networks 43 | 'record_redirect': fields.Bool(missing=False), # 是否记录重定向链接 44 | 'use_browser_cache': fields.Bool(missing=True), # 是否使用浏览器缓存 45 | 'use_result_cache': fields.Bool(missing=True), # 是否使用结果缓存 46 | 'wappalyzer': fields.Bool(missing=False), # 是否使用指纹识别 47 | 'extract_text': fields.Bool(missing=True), # 是否提取网页正文 48 | 'extract_favicon': fields.Bool(missing=False), # 是否下载网站图标 49 | 'callback_type': fields.Str(validate=OneOf(choices=['http', 'rpc'])), 50 | 'callback_address': fields.Str(), 51 | 'wait_until': fields.Str( 52 | missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])), # 控制页面何时加载成功, 53 | 'rpc_server': fields.Str(missing=os.getenv('LOCAL_RPC_SERVER_ADDRESS')) 54 | } 55 | 56 | create_task_schema = { 57 | 'customer_id': fields.Str(validate=LengthChecker(sign='自定义id', length=255)), 58 | 'task_name': fields.Str(validate=LengthChecker(sign='任务名称', length=255)), 59 | 'urls': fields.DelimitedList(fields.Str(validate=task_validator.url_valid), required=True), 60 | 'schedule_type': fields.Str(missing='instantly', validate=OneOf(['instantly', 'datetime', 'interval', 'cron'])), 61 | 'schedule_data': fields.Nested(schedule_data, missing={}), 62 | 'crawl_options': fields.Nested(crawl_options, missing={}), 63 | 'extra_data': fields.Str(), 64 | } 65 | 66 | task_id_schema = { 67 | 'task_id': 
fields.Int(required=True, validate=task_validator.task_id_exist_valid) 68 | } 69 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | 5 | 6 | def now(): 7 | return datetime.now() 8 | 9 | 10 | def nowstr(): 11 | return now().strftime('%Y-%m-%d %H:%M:%S') 12 | 13 | 14 | def today(): 15 | return now().strftime('%Y-%m-%d') 16 | 17 | 18 | def add_spider_server(address): 19 | """添加爬虫服务地址""" 20 | from webs.api.models.db_proxy import server_model_proxy 21 | server_model_proxy.add_server(address) 22 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/loggers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import socket 6 | import sys 7 | import traceback 8 | from datetime import datetime 9 | 10 | try: 11 | import simplejson as json 12 | except ImportError: 13 | import json 14 | 15 | 16 | class JSONFormatter(logging.Formatter): 17 | """ 18 | JSON formatter for python logging 19 | 20 | You can pass additional tags on a per message basis using the 21 | key "tags" in the extra parameter. 22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]}) 23 | """ 24 | 25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON', 26 | indent=None): 27 | """ 28 | :param tags: a list of tags to add to every messages 29 | :hostname: force a specific hostname 30 | :fqdn: a boolean to use the FQDN instead of the machine's hostname 31 | :message_type: the message type for Logstash formatters 32 | :indent: indent level of the JSON output 33 | """ 34 | self.message_type = message_type 35 | self.tags = tags if tags is not None else [] 36 | self.extra_tags = [] 37 | self.indent = indent 38 | 39 | if hostname: 40 | self.host = hostname 41 | elif fqdn: 42 | self.host = socket.getfqdn() 43 | else: 44 | self.host = socket.gethostname() 45 | 46 | def get_extra_fields(self, record): 47 | # The list contains all the attributes listed in 48 | # http://docs.python.org/library/logging.html#logrecord-attributes 49 | skip_list = [ 50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args', 51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg', 52 | 'msecs', 'msecs', 'message', 'name', 'pathname', 'process', 53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra'] 54 | 55 | if sys.version_info < (3, 0): 56 | easy_types = (str, bool, dict, float, int, list, type(None)) 57 | else: 58 | easy_types = (str, bool, dict, float, int, list, type(None)) 59 | 60 | fields = {} 61 | 62 | self.extra_tags = [] 63 | for key, value in record.__dict__.items(): 64 | if key not in skip_list: 65 | if key == 'tags' and isinstance(value, list): 66 | self.extra_tags = value 67 | elif isinstance(value, easy_types): 68 | fields[key] = value if value else "null" 69 | else: 70 | fields[key] = repr(value) 71 | 72 | return fields 73 | 74 | def get_debug_fields(self, record): 75 | if record.exc_info: 76 | exc_info = self.format_exception(record.exc_info) 77 | else: 78 
| exc_info = record.exc_text 79 | return { 80 | 'exc_info': exc_info, 81 | 'filename': record.filename, 82 | 'lineno': record.lineno, 83 | } 84 | 85 | @classmethod 86 | def format_source(cls, message_type, host, path): 87 | return "%s://%s/%s" % (message_type, host, path) 88 | 89 | @classmethod 90 | def format_timestamp(cls, time): 91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X")) 92 | 93 | @classmethod 94 | def format_exception(cls, exc_info): 95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else '' 96 | 97 | @classmethod 98 | def serialize(cls, message, indent=None): 99 | return json.dumps(message, ensure_ascii=False, indent=indent) 100 | 101 | def format(self, record, serialize=True): 102 | old_message = record.getMessage() 103 | try: 104 | new_message = json.loads(old_message) 105 | except json.decoder.JSONDecodeError as e: 106 | message = old_message.replace("'", '"') 107 | new_message = json.loads(message) 108 | except Exception: 109 | new_message = record.getMessage() 110 | # Create message dict 111 | message = { 112 | 'timestamp': self.format_timestamp(record.created), 113 | 'app': os.environ.get('APP_NAME'), 114 | 'host': self.host, 115 | 'environment': os.environ.get('FLASK_ENV'), 116 | 'logger': record.name, 117 | 'level': record.levelname, 118 | 'messages': new_message, 119 | 'path': record.pathname, 120 | 'tags': self.tags[:] 121 | } 122 | 123 | # Add extra fields 124 | message.update(self.get_extra_fields(record)) 125 | 126 | # Add extra tags 127 | if self.extra_tags: 128 | message['tags'].extend(self.extra_tags) 129 | 130 | # If exception, add debug info 131 | if record.exc_info or record.exc_text: 132 | message.update(self.get_debug_fields(record)) 133 | 134 | if serialize: 135 | return self.serialize(message, indent=self.indent) 136 | return message 137 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from flask import current_app, request 5 | from sqlalchemy.exc import DatabaseError 6 | from webs.api.exceptions.customs import InvalidContentType 7 | from webs.api.models import db 8 | 9 | ACL_ORIGIN = 'Access-Control-Allow-Origin' 10 | ACL_METHODS = 'Access-Control-Allow-Methods' 11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers' 12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials' 13 | ACL_CACHE_CONTROL = 'Cache-Control' 14 | 15 | GET_METHOD = 'GET' 16 | OPTIONS_METHOD = 'OPTIONS' 17 | ALLOWED_ORIGINS = '*' 18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS' 19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \ 20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type' 21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie 22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate' 23 | 24 | 25 | def before_request_middleware(app): 26 | app.before_request_funcs.setdefault(None, [ 27 | ensure_request_log, 28 | ensure_content_type, 29 | ]) 30 | 31 | 32 | def after_request_middleware(app): 33 | app.after_request_funcs.setdefault(None, [ 34 | enable_cors, 35 | commit_session, 36 | ]) 37 | 38 | 39 | def teardown_appcontext_middleware(app): 40 | app.teardown_appcontext_funcs = [ 41 | shutdown_session, 42 | ] 43 | 44 | 45 | def ensure_request_log(): 46 | """当为生产环境时,屏蔽中间件日志记录器""" 47 | if current_app.debug: 48 | current_app.logger.info( 49 
| "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || " 50 | "Parameters: {param}".format( 51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr), 53 | path=request.full_path, 54 | param=request.data.decode('utf-8'))) 55 | 56 | 57 | def ensure_content_type(): 58 | """ 59 | Ensures that the Content-Type for all requests 60 | is `application-json` or `multipart/form-data`, otherwise appropriate error 61 | is raised. 62 | :raises: InvalidContentType if Content-Type is not `application-json` 63 | or not `multipart/form-data` 64 | """ 65 | 66 | content_type = request.headers.get('Content-Type') 67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \ 68 | (not content_type or not ('application/json' in content_type or 69 | 'multipart/form-data' in content_type)): 70 | raise InvalidContentType( 71 | message='Invalid Content-Type. ' 72 | 'Only `application/json` or `multipart/form-data` is allowed') 73 | 74 | 75 | def enable_cors(response): 76 | """ 77 | Enable Cross-origin resource sharing. 78 | These headers are needed for the clients that 79 | will consume the API via AJAX requests. 80 | """ 81 | if request.method == OPTIONS_METHOD: 82 | response = current_app.make_default_options_response() 83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS 84 | response.headers[ACL_METHODS] = ALLOWED_METHODS 85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS 86 | response.headers[ACL_CACHE_CONTROL] = ACL_CACHE_CONTROL 87 | 88 | return response 89 | 90 | 91 | def commit_session(response): 92 | """ 93 | Try to commit the db session in the case 94 | of a successful request with status_code 95 | under 400. 96 | """ 97 | if response.status_code >= 400: 98 | return response 99 | try: 100 | db.session.commit() 101 | except DatabaseError: 102 | db.session.rollback() 103 | return response 104 | 105 | 106 | def shutdown_session(exception=None): 107 | """ 108 | Remove the db session and detach from the 109 | database driver after application shutdown. 110 | """ 111 | db.session.remove() 112 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/routers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pkgutil 4 | 5 | 6 | def register_routes(app): 7 | """Register routes.""" 8 | from .. import views 9 | from flask.blueprints import Blueprint 10 | 11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."): 12 | blueprint_name = name.split('.')[-1] 13 | modules = __import__(name, fromlist="dummy") 14 | blueprint = getattr(modules, blueprint_name) 15 | if isinstance(blueprint, Blueprint): 16 | app.register_blueprint(blueprint) 17 | -------------------------------------------------------------------------------- /services/engine/webs/api/utils/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask_migrate import Migrate 4 | 5 | from webs.api.models import db, redis_store 6 | 7 | 8 | def init_db(app): 9 | """ 10 | Create database if doesn't exist and 11 | create all tables. 
12 | """ 13 | 14 | # 初始化pg 15 | db.init_app(app) 16 | migrate = Migrate(compare_type=True, compare_server_default=True) 17 | migrate.init_app(app, db) 18 | 19 | # 初始化Redis 20 | redis_store.init_app(app) 21 | 22 | return db 23 | -------------------------------------------------------------------------------- /services/engine/webs/api/views/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/api/views/ping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask import Blueprint, jsonify 4 | 5 | ping = Blueprint('ping', __name__) 6 | 7 | 8 | @ping.route('/ping') 9 | def ping_pong(): 10 | """ 11 | 测试服务是否可用 12 | """ 13 | return jsonify({ 14 | "data": "pong", 15 | "status": True 16 | }) 17 | -------------------------------------------------------------------------------- /services/engine/webs/api/views/results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask import Blueprint, jsonify 4 | from webargs.flaskparser import use_args 5 | 6 | from webs.api.bizs.result import ResultBiz 7 | from webs.api.schemas.results import result_by_url_schema, result_by_id_schema, get_screenshot_schema, \ 8 | download_har_file_schema, get_favicon_schema, get_small_schema 9 | 10 | results = Blueprint('results', __name__, url_prefix='/results') 11 | 12 | 13 | @results.route('/get-by-url') 14 | @use_args(result_by_url_schema, locations=('query',)) 15 | def result_by_url(args): 16 | """ 17 | 根据url查询结果 18 | :param args: 19 | :return: 20 | """ 21 | 22 | result_biz = ResultBiz() 23 | return jsonify({ 24 | 'status': True, 25 | 'data': result_biz.result_by_url(args['url'], args['fields']) 26 | }), 200 27 | 28 | 29 | @results.route('/get-by-id') 30 | @use_args(result_by_id_schema, locations=('query',)) 31 | def result_by_id(args): 32 | """ 33 | 根据id查询结果 34 | :param args: 35 | :return: 36 | """ 37 | 38 | result_biz = ResultBiz() 39 | return jsonify({ 40 | 'status': True, 41 | 'data': result_biz.result_by_id(args['result_id']) 42 | }), 200 43 | 44 | 45 | @results.route('/screenshot') 46 | @use_args(get_screenshot_schema, locations=('query',)) 47 | def get_screenshot(args): 48 | """ 49 | 获取图片 50 | :param args: 51 | :return: 52 | """ 53 | 54 | result_biz = ResultBiz() 55 | return result_biz.get_screenshot(args['screenshot_id']) 56 | 57 | 58 | @results.route('/screenshot/encode') 59 | @use_args(get_screenshot_schema, locations=('query',)) 60 | def get_screenshot_base64(args): 61 | """ 62 | 获取图片base64编码 63 | :param args: 64 | :return: 65 | """ 66 | 67 | result_biz = ResultBiz() 68 | return jsonify({ 69 | 'status': True, 70 | 'data': result_biz.get_screenshot_base64_encode(args['screenshot_id']) 71 | }), 200 72 | 73 | 74 | @results.route('/screenshot/download') 75 | @use_args(get_screenshot_schema, locations=('query',)) 76 | def download_screenshot(args): 77 | """ 78 | 下载图片 79 | :param args: 80 | :return: 81 | """ 82 | result_biz = ResultBiz() 83 | return result_biz.get_screenshot(args['screenshot_id'], download=True) 84 | 85 | 86 | @results.route('/screenshot/small') 87 | @use_args(get_small_schema, locations=('query',)) 88 | def small_screenshot(args): 89 | """ 90 | 查看图片缩略图 91 | :param args: 92 | :return: 93 | """ 94 | result_biz = ResultBiz() 95 | return result_biz.get_small_screenshot(**args) 96 | 
97 | 98 | @results.route('/har/download') 99 | @use_args(download_har_file_schema, locations=('query',)) 100 | def download_har_file(args): 101 | """ 102 | 下载har文件 103 | :param args: 104 | :return: 105 | """ 106 | 107 | result_biz = ResultBiz() 108 | return result_biz.download_har(args['har_uuid']) 109 | 110 | 111 | @results.route('/favicon') 112 | @use_args(get_favicon_schema, locations=('query',)) 113 | def get_favicon(args): 114 | """ 115 | 查看网站图标 116 | :param args: 117 | :return: 118 | """ 119 | result_biz = ResultBiz() 120 | return result_biz.get_favicon(args['favicon_md5']) 121 | 122 | 123 | @results.route('/favicon/download') 124 | @use_args(get_favicon_schema, locations=('query',)) 125 | def download_favicon(args): 126 | """ 127 | 下载网站图标 128 | :param args: 129 | :return: 130 | """ 131 | result_biz = ResultBiz() 132 | return result_biz.get_favicon(args['favicon_md5'], download=True) 133 | -------------------------------------------------------------------------------- /services/engine/webs/api/views/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask import Blueprint, jsonify 4 | from webargs.flaskparser import use_args 5 | 6 | from webs.api.bizs.task import TaskBiz 7 | from webs.api.schemas import task_validator 8 | from webs.api.schemas.tasks import create_task_schema, task_id_schema 9 | 10 | tasks = Blueprint('tasks', __name__, url_prefix='/tasks') 11 | 12 | 13 | @tasks.route('', methods=['POST']) 14 | @use_args(create_task_schema, locations=('json',), validate=task_validator.schedule_valid) 15 | def create_task(args): 16 | """ 17 | 创建爬虫任务 18 | :param args: 19 | :return: 20 | """ 21 | 22 | task_biz = TaskBiz() 23 | data = task_biz.create_task(**args) 24 | 25 | return jsonify({ 26 | 'status': True, 27 | 'data': data 28 | }), 201 29 | 30 | 31 | @tasks.route('', methods=['DELETE']) 32 | @use_args(task_id_schema, locations=('json',)) 33 | def delete_task(args): 34 | """ 35 | 删除爬虫任务 36 | :param args: 37 | :return: 38 | """ 39 | 40 | task_biz = TaskBiz() 41 | task_biz.delete_task(args['task_id']) 42 | 43 | return jsonify({ 44 | 'status': True 45 | }), 204 46 | 47 | 48 | @tasks.route('/status') 49 | @use_args(task_id_schema, locations=('query',)) 50 | def get_task_status(args): 51 | """ 52 | 查询任务进度 53 | :param args: 54 | :return: 55 | """ 56 | 57 | task_biz = TaskBiz() 58 | return jsonify({ 59 | 'status': True, 60 | 'data': task_biz.get_task_status(**args) 61 | }), 200 62 | 63 | 64 | @tasks.route('/pause', methods=['PATCH']) 65 | @use_args(task_id_schema, locations=('json',)) 66 | def pause_task(args): 67 | """ 68 | 暂停调度任务 69 | :param args: 70 | :return: 71 | """ 72 | 73 | task_biz = TaskBiz() 74 | task_biz.pause_task(args['task_id']) 75 | 76 | return jsonify({ 77 | 'status': True, 78 | }), 200 79 | 80 | 81 | @tasks.route('/resume', methods=['PATCH']) 82 | @use_args(task_id_schema, locations=('json',)) 83 | def resume_task(args): 84 | """ 85 | 恢复调度任务 86 | :param args: 87 | :return: 88 | """ 89 | 90 | task_biz = TaskBiz() 91 | task_biz.resume_task(args['task_id']) 92 | 93 | return jsonify({ 94 | 'status': True, 95 | }), 200 96 | 97 | 98 | @tasks.route('/redelivery', methods=['POST']) 99 | @use_args(task_id_schema, locations=('json',)) 100 | def redelivery(args): 101 | """ 102 | 重新下发 103 | """ 104 | 105 | task_biz = TaskBiz() 106 | task_biz.redelivery(args['task_id']) 107 | return jsonify({'status': True}), 200 108 | -------------------------------------------------------------------------------- 
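Taken together, tasks.py above exposes the full lifecycle of a crawl task over HTTP: create, query status, pause/resume, redeliver, delete. A minimal client sketch follows (Python requests; the engine base URL and the create-task payload fields are assumptions for illustration only, the authoritative payload is defined by create_task_schema in webs/api/schemas/tasks.py):

# -*- coding: utf-8 -*-
# Hypothetical client for the task endpoints defined in tasks.py above.
import requests

BASE = 'http://localhost:5000'  # assumed engine address

# Create a crawl task (201 on success). The JSON body is illustrative only;
# the real fields are validated by create_task_schema / task_validator.schedule_valid.
created = requests.post(f'{BASE}/tasks', json={'urls': ['https://example.com']}).json()
task_id = created['data'].get('task_id')  # response field name assumed

# Poll progress (GET /tasks/status?task_id=...).
progress = requests.get(f'{BASE}/tasks/status', params={'task_id': task_id}).json()

# Pause / resume the scheduled task (PATCH with a JSON body).
requests.patch(f'{BASE}/tasks/pause', json={'task_id': task_id})
requests.patch(f'{BASE}/tasks/resume', json={'task_id': task_id})

# Re-deliver the task, then delete it (DELETE returns 204).
requests.post(f'{BASE}/tasks/redelivery', json={'task_id': task_id})
requests.delete(f'{BASE}/tasks', json={'task_id': task_id})

Every non-GET call must carry Content-Type: application/json (enforced by ensure_content_type in webs/api/utils/requests.py), which the requests library's json= argument sets automatically.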
/services/engine/webs/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import os 5 | 6 | 7 | class BaseConfig: 8 | """Base configuration""" 9 | 10 | # Root path of project 11 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 12 | 13 | DEBUG = True 14 | TESTING = False 15 | SQLALCHEMY_TRACK_MODIFICATIONS = False 16 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True} 17 | SECRET_KEY = os.environ.get('SECRET_KEY') 18 | 19 | # Redis configuration 20 | REDIS_URL = os.environ.get('REDIS_URL') 21 | 22 | 23 | class DevelopmentConfig(BaseConfig): 24 | """Development configuration""" 25 | 26 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL') 27 | 28 | 29 | class ProductionConfig(BaseConfig): 30 | """Production configuration""" 31 | 32 | DEBUG = False 33 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL') 34 | -------------------------------------------------------------------------------- /services/engine/webs/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/webs/core/requests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .request import web_client 4 | -------------------------------------------------------------------------------- /services/engine/webs/core/requests/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import requests 5 | 6 | from webs.api.exceptions.customs import ServerError, InvalidAPIRequest, RecordNotFound, RecordAlreadyExists 7 | 8 | 9 | class RequestMixin(object): 10 | CODE_EXCEPTION_MSG = { 11 | 400: InvalidAPIRequest, 12 | 404: RecordNotFound, 13 | 409: RecordAlreadyExists, 14 | 422: InvalidAPIRequest, 15 | 500: ServerError, 16 | } 17 | 18 | def __init__(self): 19 | self.session = requests.Session() 20 | 21 | @property 22 | def _headers(self): 23 | return { 24 | "Content-Type": "application/json", 25 | } 26 | 27 | def request(self, server, method, url, json=None, params=None, timeout=60): 28 | try: 29 | response = self.session.request( 30 | method, url, json=json, params=params, 31 | timeout=timeout, headers=self._headers 32 | ) 33 | except requests.exceptions.ConnectTimeout: 34 | raise self.CODE_EXCEPTION_MSG[500](f"{server}服务器连接超时!") 35 | except requests.exceptions.ConnectionError: 36 | raise self.CODE_EXCEPTION_MSG[500](f"{server}服务器连接错误!") 37 | 38 | try: 39 | response_data = response.json() 40 | except Exception as e: 41 | raise ServerError(f"{server}服务器参数解析失败!") 42 | 43 | if not (200 <= response.status_code < 300): 44 | exception = self.CODE_EXCEPTION_MSG[response.status_code] \ 45 | if response.status_code in self.CODE_EXCEPTION_MSG else self.CODE_EXCEPTION_MSG[400] 46 | raise exception(f"{server} Response:{response_data.get('error').get('message')}") 47 | 48 | return response_data 49 | 50 | 51 | web_client = RequestMixin() 52 | -------------------------------------------------------------------------------- /services/engine/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | from celery import Celery 5 | 6 | ################## 7 | # Celery配置 8 | from webs import create_app 9 | 10 | 11 | class 
CeleryConfig(object): 12 | # 任务与劣化为json,从Celery4.0开始,默认序列化器将为json 13 | task_serializer = 'json' 14 | 15 | # 结果序列化为json 16 | result_serializer = 'json' 17 | 18 | # 定时任务过期时间 19 | result_expires = 60 * 60 * 24 20 | 21 | # 允许接收的任务类型 22 | accept_content = ["json"] 23 | 24 | # 每个进程预取任务数 25 | worker_prefetch_multiplier = 1 26 | 27 | # 每个worker执行200个任务就销毁重启 28 | worker_max_tasks_per_child = 200 29 | 30 | # 时区设置 31 | timezone = 'Asia/Shanghai' 32 | enable_utc = True 33 | 34 | 35 | ################## 36 | # 初始化celery worker 37 | def init_celery(app=None, celery_type='usual'): 38 | app = app or create_app() 39 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL')) 40 | celery_app.config_from_object(CeleryConfig) 41 | 42 | # 导入相关任务模块 43 | if celery_type == 'usual': 44 | celery_app.conf.update(imports=['worker.engine', 'worker.result']) 45 | elif celery_type == 'beat': 46 | pass 47 | # celery_app.conf.update( 48 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron']) 49 | # celery_app.conf.update( 50 | # CELERYBEAT_SCHEDULE={ 51 | # } 52 | # ) 53 | 54 | # 在flask上下文中执行 55 | class ContextTask(celery_app.Task): 56 | """Make celery tasks work with Flask app context""" 57 | 58 | def __call__(self, *args, **kwargs): 59 | with app.app_context(): 60 | return self.run(*args, **kwargs) 61 | 62 | celery_app.Task = ContextTask 63 | return celery_app 64 | 65 | 66 | celery_app = init_celery() 67 | # beat_app = init_celery(celery_type='beat') 68 | -------------------------------------------------------------------------------- /services/engine/worker/engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.exceptions.customs import RecordNotFound 4 | from webs.api.models.db_proxy import server_model_proxy, task_model_proxy, subtask_model_proxy, url_model_proxy, \ 5 | schedule_task_proxy, task_url_model_proxy, result_model_proxy 6 | from worker import celery_app 7 | from worker.library.helper import split_urls, send, WeightedRandomGenerator, remove_files 8 | 9 | 10 | @celery_app.task(name='delivery_task') 11 | def delivery_task(task_id): 12 | """ 13 | 下发任务 14 | :param task_id: 15 | :return: 16 | """ 17 | 18 | task_obj = task_model_proxy.find(id=task_id) 19 | if not task_obj: 20 | return 21 | 22 | # 增加爬虫任务调度记录 23 | schedule_task_obj = task_model_proxy.add_schedule_record( 24 | task_id=task_id, schedule_task_status="start_delivery", crawl_options=task_obj.crawl_options) 25 | schedule_task_id = schedule_task_obj.id 26 | 27 | # 查询待爬取的url struct列表 28 | urls_struct = task_url_model_proxy.query_urls_by_task_id(task_id) 29 | 30 | # url分块处理 31 | chunk_urls_struct = split_urls(urls_struct) 32 | 33 | # 获取爬虫节点服务器配额 34 | servers_info = server_model_proxy.query_servers_by_score(sort='desc') 35 | if not servers_info: 36 | task_model_proxy.set_many_attr(obj_id=task_id, fields_v={'task_status': 'No server found!', 'finished': True}) 37 | raise RecordNotFound('No server found!') 38 | 39 | ###### 投递子任务 40 | # 当url分块数小于服务器节点数时采用轮询算法 41 | if len(chunk_urls_struct) <= len(servers_info): 42 | for index, chunk_url_struct in enumerate(chunk_urls_struct): 43 | send(schedule_task_id, chunk_url_struct, servers_info[index], task_obj.crawl_options) 44 | 45 | # 否则使用加权随机算法 46 | else: 47 | server_seeds = WeightedRandomGenerator(servers_info) 48 | for chunk_url_struct in chunk_urls_struct: 49 | send(schedule_task_id, chunk_url_struct, server_seeds.spawn(), task_obj.crawl_options) 50 | 51 | 
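# (Illustrative note added to the source, not original.) WeightedRandomGenerator
# picks a server with probability proportional to its 'score': given servers_info
# scored e.g. [{'score': 3, ...}, {'score': 1, ...}], spawn() returns the first
# node roughly 75% of the time, so URL chunks are skewed toward higher-capacity
# crawler nodes rather than spread evenly as in the round-robin branch above.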
###### 根据子任务发送情况设置主任务状态 52 | # 查询子任务投递失败数 53 | failure_count = subtask_model_proxy.query_delivery_failure_count(schedule_task_id) 54 | # 如果子任务投递全部失败,则设置当前调度任务为投递失败状态 55 | if failure_count == len(chunk_urls_struct): 56 | schedule_task_proxy.set_many_attr( 57 | obj_id=schedule_task_id, fields_v={'schedule_task_status': 'delivery_failure', 'finished': True} 58 | ) 59 | # 如果是临时任务,则直接标记主任务为失败状态 60 | if task_obj.schedule_options.get('schedule_type') == 'instantly': 61 | task_model_proxy.set_many_attr( 62 | obj=task_obj, fields_v={'task_status': 'delivery_failure', 'finished': True} 63 | ) 64 | 65 | # 只要有一个投递失败,则标记为部分失败 66 | elif failure_count != 0: 67 | schedule_task_proxy.set_attr_by_id( 68 | obj_id=schedule_task_id, field='schedule_task_status', value='part_delivery_failure') 69 | 70 | # 否则标记全部投递成功 71 | else: 72 | schedule_task_proxy.set_attr_by_id( 73 | obj_id=schedule_task_id, field='schedule_task_status', value='delivery_success') 74 | 75 | 76 | @celery_app.task(name='delete_task') 77 | def delete_task(task_id): 78 | """ 79 | 删除任务,因需要删除截图和har文件 故使用异步方式进行删除 80 | :param task_id: 81 | :return: 82 | """ 83 | 84 | # 查询所有schedule task 85 | schedule_task_subquery = schedule_task_proxy.db_session.query(schedule_task_proxy.model.id) \ 86 | .filter(schedule_task_proxy.model.task_id == task_id).subquery() 87 | 88 | # 查询所有subtask 89 | subtask_subquery = subtask_model_proxy.db_session.query(subtask_model_proxy.model.id).filter( 90 | subtask_model_proxy.model.schedule_task_id.in_(schedule_task_subquery)).subquery() 91 | 92 | ###### 删除结果 93 | result_query = result_model_proxy.self_session.filter( 94 | result_model_proxy.model.subtask_id.in_(subtask_subquery)) 95 | 96 | # 删除截图 97 | screenshot_ids = [each.screenshot_id + '.png' for each in 98 | result_query.filter(result_model_proxy.model.screenshot_id.isnot(None)).all()] 99 | remove_files(path='screenshots', file_ids=screenshot_ids) 100 | 101 | # 删除hars 102 | har_ids = [each.har_uuid + '.json' for each in 103 | result_query.filter(result_model_proxy.model.har_uuid.isnot(None)).all()] 104 | remove_files(path='hars', file_ids=har_ids) 105 | 106 | # 删除结果 107 | result_query.delete(synchronize_session=False) 108 | result_model_proxy.safe_commit() 109 | 110 | # 删除Schedule task 111 | schedule_task_proxy.delete_models(ids=schedule_task_subquery, fields='id') 112 | 113 | # 删除subtask 114 | subtask_model_proxy.delete_models(ids=subtask_subquery, fields='id') 115 | 116 | # 删除task_url 117 | task_url_model_proxy.delete_model(task_id=task_id) 118 | -------------------------------------------------------------------------------- /services/engine/worker/library/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/engine/worker/library/favicon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import requests 5 | 6 | from urllib.parse import urljoin, urlparse 7 | from bs4 import BeautifulSoup 8 | 9 | favicon_link_rules = [ 10 | 'icon', 11 | 'shortcut icon', 12 | 'apple-touch-icon', 13 | 'apple-touch-icon-precomposed', 14 | ] 15 | 16 | meta_names = ['msapplication-TileImage', 'og:image'] 17 | 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36' 20 | } 21 | 22 | 23 | def get_favicon_link(url, html): 24 | """ 25 | 获取网站图标链接 26 | 
:param url: 27 | :param html: 28 | :return: 29 | """ 30 | 31 | # 正则匹配网站源码是否包含图标链接 32 | soup = BeautifulSoup(html, features='html.parser') 33 | 34 | # 查找link标签 35 | for rule in favicon_link_rules: 36 | favicon_tag = soup.find('link', attrs={'rel': lambda r: r and r.lower() == rule, 'href': True}) 37 | if favicon_tag: 38 | favicon_href = favicon_tag.get('href', '').strip() 39 | return fmt_link(url, favicon_href) 40 | 41 | # 查找meta标签 42 | for meta_tag in soup.find_all('meta', attrs={'content': True}): 43 | meta_type = meta_tag.get('name') or meta_tag.get('property') or ''.lower() 44 | for name in meta_names: 45 | if meta_type == name.lower(): 46 | favicon_href = meta_tag.get('href', '').strip() 47 | return fmt_link(url, favicon_href) 48 | 49 | # 请求根目录下是否存在/favicon.ico文件 50 | root_icon_link = get_root_dir_icon(url) 51 | if root_icon_link: 52 | return root_icon_link, 'ico' 53 | 54 | return None, None 55 | 56 | 57 | def fmt_link(website_url, href): 58 | """ 59 | 格式化标签 60 | :param website_url: 61 | :param href: 62 | :return: favicon_link, ext 63 | """ 64 | 65 | if not href or href.startswith('data:image/'): 66 | return None, None 67 | 68 | if not urlparse(href).netloc: 69 | href = urljoin(website_url, href) 70 | 71 | if urlparse(href).netloc: 72 | url_parsed = href 73 | else: 74 | url_parsed = urljoin(website_url, href) 75 | 76 | url_parsed = urlparse(url_parsed, scheme=urlparse(website_url).scheme) 77 | _, ext = os.path.splitext(url_parsed.path) 78 | favicon_url = url_parsed.geturl() 79 | try: 80 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers) 81 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'): 82 | return favicon_url, ext[1:].lower() 83 | except Exception as e: 84 | return None, None 85 | return None, None 86 | 87 | 88 | def get_root_dir_icon(url): 89 | try: 90 | parsed = urlparse(url) 91 | favicon_url = parsed.scheme + "://" + parsed.netloc + '/favicon.ico' 92 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers) 93 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'): 94 | return response.url 95 | except Exception as e: 96 | return 97 | 98 | return 99 | -------------------------------------------------------------------------------- /services/engine/worker/library/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import bisect 4 | import hashlib 5 | import math 6 | import os 7 | import random 8 | import uuid 9 | from datetime import datetime 10 | 11 | import requests 12 | from html2text import HTML2Text 13 | 14 | from webs.api.models.db_proxy import subtask_model_proxy 15 | from webs.core.requests import web_client 16 | from worker.library.favicon import get_favicon_link 17 | 18 | 19 | class WeightedRandomGenerator(object): 20 | def __init__(self, weights): 21 | print(weights) 22 | self.weights = weights 23 | self.totals = [] 24 | running_total = 0 25 | 26 | for w in weights: 27 | running_total += w['score'] 28 | self.totals.append(running_total) 29 | 30 | def spawn(self): 31 | rnd = random.random() * self.totals[-1] 32 | index = bisect.bisect_right(self.totals, rnd) 33 | return self.weights[index] 34 | 35 | def __call__(self): 36 | return self.spawn() 37 | 38 | 39 | def split_urls(urls): 40 | """对url列表进行拆分""" 41 | if len(urls) > 100: 42 | m = len(urls) // 100 43 | n = int(math.ceil(len(urls) / float(m))) 44 | chunk_list = [urls[i:i + 
n] for i in range(0, len(urls), n)] 45 | else: 46 | chunk_list = [urls] 47 | 48 | return chunk_list 49 | 50 | 51 | def send(schedule_task_id, url_nested_list, server_info, options): 52 | # 创建子任务模型 53 | subtask_obj = subtask_model_proxy.create(schedule_task_id, server_id=server_info['server_id']) 54 | 55 | # 发送请求 56 | try: 57 | response = web_client.request( 58 | server=server_info['server_name'], 59 | url=server_info['server_address'] + '/crawl_tasks', 60 | method='POST', timeout=60, 61 | json={ 62 | 'subtask_id': subtask_obj.id, 63 | 'url_nested_list': url_nested_list, 64 | 'options': options 65 | } 66 | ) 67 | failure_msg = '' if response['status'] is True else response['error']['message'] 68 | except Exception as e: 69 | failure_msg = e.message 70 | if failure_msg: 71 | # 设置子任务失败原因 72 | subtask_model_proxy.set_many_attr(obj=subtask_obj, fields_v={ 73 | 'finished': True, 74 | 'finished_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 75 | 'delivery_failure_msg': failure_msg 76 | }) 77 | 78 | 79 | def extract_text(content): 80 | """ 81 | 提取网页正文 82 | :param content: 83 | :return: 84 | """ 85 | 86 | h = HTML2Text(bodywidth=0) 87 | h.ignore_links = True 88 | h.ignore_images = True 89 | h.ignore_tables = True 90 | h.ignore_emphasis = True 91 | try: 92 | result = h.handle(content).replace('*', '').replace('\n\n', '\n') 93 | except Exception as e: 94 | result = None 95 | return '' if result == '\n' else result 96 | 97 | 98 | def save_favicon(url, html): 99 | """ 100 | 保存网站图标 101 | :param url: 102 | :param html: 103 | :return: 104 | """ 105 | favicon_link, icon_ext = get_favicon_link(url, html) 106 | if favicon_link: 107 | try: 108 | response = requests.get(favicon_link, stream=True, timeout=10) 109 | except Exception as e: 110 | return None, None 111 | temp_filename = str(uuid.uuid4()) 112 | save_path = '/usr/src/app/screenshots/{}.{}'.format(temp_filename, icon_ext) 113 | with open(save_path, 'wb+') as image: 114 | for chunk in response.iter_content(1024): 115 | image.write(chunk) 116 | image.seek(0) 117 | favicon_md5 = hashlib.md5(image.read()).hexdigest() 118 | os.rename(save_path, '/usr/src/app/screenshots/{}.{}'.format(favicon_md5, icon_ext)) 119 | return favicon_md5, favicon_link 120 | return None, None 121 | 122 | 123 | def remove_files(path, file_ids): 124 | """ 125 | 文件 126 | :return: 127 | """ 128 | 129 | for file_id in file_ids: 130 | try: 131 | os.remove(f'/usr/src/app/{path}/{file_id}') 132 | except FileNotFoundError as e: 133 | pass 134 | -------------------------------------------------------------------------------- /services/engine/worker/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import re 5 | 6 | from rpc.client.callback_client import CallbackClient 7 | from wappalyzer import wappalyzer_handler 8 | from webs.api.models.db_proxy import result_model_proxy, task_model_proxy 9 | from webs.core.requests import web_client 10 | from worker import celery_app 11 | from worker.library.helper import extract_text, save_favicon 12 | 13 | 14 | def callback_http(callback_address, task_obj, result, finished): 15 | """ 16 | 回调方式为http 17 | :return: 18 | """ 19 | try: 20 | response = web_client.request( 21 | server='callback', method='POST', 22 | url=callback_address, 23 | timeout=60, json={ 24 | 'customer_id': task_obj.customer_id, 25 | 'extra_data': task_obj.extra_data, 26 | 'task_id': task_obj.id, 27 | 'finished': finished, 28 | 'result': result 29 | } 30 | ) 31 | failure_msg = '' if 
response['status'] is True else response['error']['message'] 32 | except Exception as e: 33 | failure_msg = e.message 34 | if failure_msg and result.get('result_id'): 35 | result_model_proxy.set_attr_by_id(result['result_id'], 'callback_failure_msg', failure_msg) 36 | 37 | 38 | def callback_grpc(callback_address, task_obj, result): 39 | """ 40 | 回调方式为rpc 41 | :return: 42 | """ 43 | 44 | callback_client = CallbackClient(rpc_server=callback_address) 45 | callback_client.callback_save_result(task_obj, result) 46 | 47 | 48 | @celery_app.task(name='save_base_result') 49 | def save_base_result_by_grpc(**kwargs): 50 | """ 51 | 异步回收相关爬取数据 52 | :param kwargs: 53 | :return: 54 | """ 55 | 56 | task_obj = task_model_proxy.query_task_obj_by_subtask(subtask_id=kwargs['subtask_id']) 57 | if not task_obj: 58 | return 59 | 60 | # 解析网站编码 61 | try: 62 | m = re.compile(' /etc/timezone 16 | 17 | # set working directory 18 | RUN mkdir -p /usr/src/app 19 | WORKDIR /usr/src/app 20 | 21 | # add and install requirements 22 | COPY ./requirements.txt /usr/src/app/requirements.txt 23 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \ 24 | pip install -r requirements.txt -i https://pypi.douban.com/simple 25 | 26 | # add app 27 | COPY . /usr/src/app 28 | 29 | # run server 30 | CMD ["/usr/src/app/entrypoint.sh"] 31 | -------------------------------------------------------------------------------- /services/spider/Dockerfile-prod: -------------------------------------------------------------------------------- 1 | FROM harbor.socmap.net/crawloop/playwright-xvfb:v1.0.0 2 | 3 | WORKDIR /usr/src/app 4 | 5 | COPY ./requirements.txt /usr/src/app 6 | 7 | RUN apt update && \ 8 | apt-get -y install netcat && \ 9 | rm -rf /var/lib/apt/lists/* 10 | 11 | 12 | # set timezone 13 | ENV TZ=Asia/Shanghai 14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 15 | 16 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \ 17 | pip install -r requirements.txt -i https://pypi.douban.com/simple 18 | 19 | COPY . /usr/src/app 20 | 21 | RUN sh build.sh 22 | 23 | CMD ["/usr/src/app/entrypoint.sh"] 24 | 25 | 26 | -------------------------------------------------------------------------------- /services/spider/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 清除缓存目录 4 | find . -type d -name __pycache__ | xargs rm -rf 5 | 6 | # 编译代码 7 | python3 compile.py build_ext --inplace 8 | if [ $? 
-ne 0 ]; then 9 | exit 1 10 | fi 11 | 12 | # 将.so文件改名 13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh 16 | 17 | # 删除.py文件 18 | find ./rpc -name '*.py' | xargs rm -f 19 | find ./webs -name '*.py' | xargs rm -f 20 | find ./worker -name '*.py' | xargs rm -f 21 | 22 | # 清除不需要的文件 23 | rm -rf build 24 | rm -f .gitignore 25 | rm -f compile.py 26 | rm -f build.sh 27 | -------------------------------------------------------------------------------- /services/spider/build_sentry_ini.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | 6 | from configobj import ConfigObj 7 | 8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8') 9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False) 10 | log_ini.write() 11 | -------------------------------------------------------------------------------- /services/spider/compile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Cython.Build import cythonize 4 | from Cython.Distutils import build_ext 5 | from setuptools import setup 6 | from setuptools.extension import Extension 7 | 8 | setup( 9 | ext_modules=cythonize( 10 | [ 11 | Extension('rpc.*', ['rpc/*.py']), 12 | Extension('rpc.pb.*', ['rpc/pb/*.py']), 13 | Extension('rpc.client.*', ['rpc/client/*.py']), 14 | Extension('webs.*', ['webs/*.py']), 15 | Extension('webs.api.*', ['webs/api/*.py']), 16 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']), 17 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']), 18 | Extension('webs.api.models*', ['webs/api/models/*.py']), 19 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']), 20 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']), 21 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']), 22 | Extension('webs.api.views.*', ['webs/api/views/*.py']), 23 | Extension('worker.*', ['worker/*.py']), 24 | Extension('worker.library.*', ['worker/library/*.py']), 25 | ], 26 | build_dir='build', 27 | compiler_directives=dict( 28 | always_allow_keywords=True, language_level=3 29 | ) 30 | ), 31 | cmdclass=dict( 32 | build_ext=build_ext 33 | ) 34 | ) 35 | -------------------------------------------------------------------------------- /services/spider/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # web服务 4 | if [ "$ENDPOINT" = "web" ]; then 5 | # 开发环境 6 | if [ "$FLASK_ENV" = "development" ]; then 7 | flask run -h 0.0.0.0 -p 5000 8 | 9 | # 生产环境 10 | elif [ "$FLASK_ENV" = "production" ]; then 11 | python build_sentry_ini.py 12 | gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app 13 | fi 14 | 15 | # 爬取 16 | elif [ "$ENDPOINT" = "fetch" ]; then 17 | # 开启虚拟显示器 18 | echo "开启xvfb" 19 | rm -rf /tmp/.X99-lock 20 | Xvfb -screen 0 1020x720x16 :99 & 21 | export DISPLAY=:99 22 | celery -A worker.celery_app worker -Q priority_fetch -l info -c $WORK_MAX_COUNT --prefetch-multiplier 1 --max-tasks-per-child 1 -n crawl_fetch@%h 23 | 24 | # 保存结果 25 | elif [ "$ENDPOINT" = "results" ]; then 26 | # celery -A worker.celery_app worker -Q results -l info -c 5 
--prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h 27 | celery -A worker.celery_app worker -Q results -l info --pool=prefork --concurrency=5 --without-heartbeat --prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h 28 | fi 29 | -------------------------------------------------------------------------------- /services/spider/gunicorn_config.py: -------------------------------------------------------------------------------- 1 | # Sample Gunicorn configuration file. 2 | 3 | import multiprocessing as mlp 4 | 5 | # 解决无限递归 6 | import gevent.monkey 7 | 8 | gevent.monkey.patch_all() 9 | 10 | # 11 | # Server socket 12 | # 13 | # bind - The socket to bind. 14 | # 15 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'. 16 | # An IP is a valid HOST. 17 | # 18 | # backlog - The number of pending connections. This refers 19 | # to the number of clients that can be waiting to be 20 | # served. Exceeding this number results in the client 21 | # getting an error when attempting to connect. It should 22 | # only affect servers under significant load. 23 | # 24 | # Must be a positive integer. Generally set in the 64-2048 25 | # range. 26 | # 27 | 28 | bind = '0.0.0.0:5000' 29 | backlog = 2048 30 | 31 | # 32 | # Worker processes 33 | # 34 | # workers - The number of worker processes that this server 35 | # should keep alive for handling requests. 36 | # 37 | # A positive integer generally in the 2-4 x $(NUM_CORES) 38 | # range. You'll want to vary this a bit to find the best 39 | # for your particular application's work load. 40 | # 41 | # worker_class - The type of workers to use. The default 42 | # sync class should handle most 'normal' types of work 43 | # loads. You'll want to read 44 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type 45 | # for information on when you might want to choose one 46 | # of the other worker classes. 47 | # 48 | # A string referring to a Python path to a subclass of 49 | # gunicorn.workers.base.Worker. The default provided values 50 | # can be seen at 51 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class 52 | # 53 | # worker_connections - For the eventlet and gevent worker classes 54 | # this limits the maximum number of simultaneous clients that 55 | # a single process can handle. 56 | # 57 | # A positive integer generally set to around 1000. 58 | # 59 | # timeout - If a worker does not notify the master process in this 60 | # number of seconds it is killed and a new worker is spawned 61 | # to replace it. 62 | # 63 | # Generally set to thirty seconds. Only set this noticeably 64 | # higher if you're sure of the repercussions for sync workers. 65 | # For the non sync workers it just means that the worker 66 | # process is still communicating and is not tied to the length 67 | # of time required to handle a single request. 68 | # 69 | # keepalive - The number of seconds to wait for the next request 70 | # on a Keep-Alive HTTP connection. 71 | # 72 | # A positive integer. Generally set in the 1-5 seconds range. 73 | # 74 | 75 | # Number of processes 76 | workers = mlp.cpu_count() * 2 + 1 77 | 78 | # Threads 79 | threads = mlp.cpu_count() * 2 80 | 81 | worker_class = 'gevent' 82 | worker_connections = 1000 83 | timeout = 30 84 | keepalive = 2 85 | 86 | # 87 | # spew - Install a trace function that spews every line of Python 88 | # that is executed when running the server. This is the 89 | # nuclear option. 
90 | # 91 | # True or False 92 | # 93 | 94 | spew = False 95 | 96 | # 97 | # Server mechanics 98 | # 99 | # daemon - Detach the main Gunicorn process from the controlling 100 | # terminal with a standard fork/fork sequence. 101 | # 102 | # True or False 103 | # 104 | # raw_env - Pass environment variables to the execution environment. 105 | # 106 | # pidfile - The path to a pid file to write 107 | # 108 | # A path string or None to not write a pid file. 109 | # 110 | # user - Switch worker processes to run as this user. 111 | # 112 | # A valid user id (as an integer) or the name of a user that 113 | # can be retrieved with a call to pwd.getpwnam(value) or None 114 | # to not change the worker process user. 115 | # 116 | # group - Switch worker process to run as this group. 117 | # 118 | # A valid group id (as an integer) or the name of a user that 119 | # can be retrieved with a call to pwd.getgrnam(value) or None 120 | # to change the worker processes group. 121 | # 122 | # umask - A mask for file permissions written by Gunicorn. Note that 123 | # this affects unix socket permissions. 124 | # 125 | # A valid value for the os.umask(mode) call or a string 126 | # compatible with int(value, 0) (0 means Python guesses 127 | # the base, so values like "0", "0xFF", "0022" are valid 128 | # for decimal, hex, and octal representations) 129 | # 130 | # tmp_upload_dir - A directory to store temporary request data when 131 | # requests are read. This will most likely be disappearing soon. 132 | # 133 | # A path to a directory where the process owner can write. Or 134 | # None to signal that Python should choose one on its own. 135 | # 136 | 137 | # 138 | # Logging 139 | # 140 | # logfile - The path to a log file to write to. 141 | # 142 | # A path string. "-" means log to stdout. 143 | # 144 | # loglevel - The granularity of log output 145 | # 146 | # A string of "debug", "info", "warning", "error", "critical" 147 | # 148 | 149 | errorlog = '-' 150 | loglevel = 'error' 151 | accesslog = '-' 152 | access_log_format = '{"request_address": "%(h)s", ' \ 153 | '"request_time": "%(t)s", ' \ 154 | '"request": "%(r)s", ' \ 155 | '"http_status_code": "%(s)s", ' \ 156 | '"http_request_url": "%(U)s", ' \ 157 | '"http_query_string": "%(q)s", ' \ 158 | '"request_headers": {' \ 159 | '"content-type": "%({content-type}i)s", ' \ 160 | '"content-length": "%({content-length}i)s", ' \ 161 | '"user-agent": "%(a)s"' \ 162 | '}}' 163 | -------------------------------------------------------------------------------- /services/spider/gunicorn_logging.ini: -------------------------------------------------------------------------------- 1 | # Logging configuration 2 | 3 | [loggers] 4 | keys = root, gunicorn.access, gunicorn.error 5 | 6 | [handlers] 7 | keys = access, error, sentry 8 | 9 | [formatters] 10 | keys = json, generic 11 | 12 | # Root logger 13 | # The root logger sends messages to the console and to Sentry. 14 | [logger_root] 15 | handlers = error, sentry 16 | 17 | # Gunicorn loggers 18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'. 19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation. 20 | # Only the critical logger has a handler to send messages to Sentry. 
21 | 22 | [logger_gunicorn.access] 23 | level = INFO 24 | handlers = access 25 | propagate = 0 26 | qualname = gunicorn.access 27 | 28 | [logger_gunicorn.error] 29 | level = ERROR 30 | handlers = error, sentry 31 | propagate = 0 32 | qualname = gunicorn.error 33 | 34 | # Handlers 35 | [handler_access] 36 | class = StreamHandler 37 | formatter = json 38 | args = (sys.stdout, ) 39 | 40 | [handler_error] 41 | class = StreamHandler 42 | formatter = json 43 | args = (sys.stderr,) 44 | 45 | [handler_sentry] 46 | class = raven.handlers.logging.SentryHandler 47 | level = ERROR 48 | formatter = generic 49 | sentry_dsn = example 50 | args = [%(sentry_dsn)s] 51 | 52 | [formatter_generic] 53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s 54 | [formatter_json] 55 | class = webs.api.utils.loggers.JSONFormatter -------------------------------------------------------------------------------- /services/spider/manage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask.cli import FlaskGroup 4 | from webs import create_app 5 | 6 | app = create_app() 7 | cli = FlaskGroup(create_app=create_app) 8 | 9 | if __name__ == '__main__': 10 | cli() 11 | -------------------------------------------------------------------------------- /services/spider/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /services/spider/migrations/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # template used to generate migration files 5 | # file_template = %%(rev)s_%%(slug)s 6 | 7 | # set to 'true' to run the environment during 8 | # the 'revision' command, regardless of autogenerate 9 | # revision_environment = false 10 | 11 | 12 | # Logging configuration 13 | [loggers] 14 | keys = root,sqlalchemy,alembic 15 | 16 | [handlers] 17 | keys = console 18 | 19 | [formatters] 20 | keys = generic 21 | 22 | [logger_root] 23 | level = WARN 24 | handlers = console 25 | qualname = 26 | 27 | [logger_sqlalchemy] 28 | level = WARN 29 | handlers = 30 | qualname = sqlalchemy.engine 31 | 32 | [logger_alembic] 33 | level = INFO 34 | handlers = 35 | qualname = alembic 36 | 37 | [handler_console] 38 | class = StreamHandler 39 | args = (sys.stderr,) 40 | level = NOTSET 41 | formatter = generic 42 | 43 | [formatter_generic] 44 | format = %(levelname)-5.5s [%(name)s] %(message)s 45 | datefmt = %H:%M:%S 46 | -------------------------------------------------------------------------------- /services/spider/migrations/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | import logging 4 | from logging.config import fileConfig 5 | 6 | from sqlalchemy import engine_from_config 7 | from sqlalchemy import pool 8 | 9 | from alembic import context 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 
17 | fileConfig(config.config_file_name) 18 | logger = logging.getLogger('alembic.env') 19 | 20 | # add your model's MetaData object here 21 | # for 'autogenerate' support 22 | # from myapp import mymodel 23 | # target_metadata = mymodel.Base.metadata 24 | from flask import current_app 25 | config.set_main_option('sqlalchemy.url', 26 | current_app.config.get('SQLALCHEMY_DATABASE_URI')) 27 | target_metadata = current_app.extensions['migrate'].db.metadata 28 | 29 | # other values from the config, defined by the needs of env.py, 30 | # can be acquired: 31 | # my_important_option = config.get_main_option("my_important_option") 32 | # ... etc. 33 | 34 | 35 | def run_migrations_offline(): 36 | """Run migrations in 'offline' mode. 37 | 38 | This configures the context with just a URL 39 | and not an Engine, though an Engine is acceptable 40 | here as well. By skipping the Engine creation 41 | we don't even need a DBAPI to be available. 42 | 43 | Calls to context.execute() here emit the given string to the 44 | script output. 45 | 46 | """ 47 | url = config.get_main_option("sqlalchemy.url") 48 | context.configure( 49 | url=url, target_metadata=target_metadata, literal_binds=True 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online(): 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 61 | 62 | """ 63 | 64 | # this callback is used to prevent an auto-migration from being generated 65 | # when there are no changes to the schema 66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html 67 | def process_revision_directives(context, revision, directives): 68 | if getattr(config.cmd_opts, 'autogenerate', False): 69 | script = directives[0] 70 | if script.upgrade_ops.is_empty(): 71 | directives[:] = [] 72 | logger.info('No changes in schema detected.') 73 | 74 | connectable = engine_from_config( 75 | config.get_section(config.config_ini_section), 76 | prefix='sqlalchemy.', 77 | poolclass=pool.NullPool, 78 | ) 79 | 80 | with connectable.connect() as connection: 81 | context.configure( 82 | connection=connection, 83 | target_metadata=target_metadata, 84 | process_revision_directives=process_revision_directives, 85 | **current_app.extensions['migrate'].configure_args 86 | ) 87 | 88 | with context.begin_transaction(): 89 | context.run_migrations() 90 | 91 | 92 | if context.is_offline_mode(): 93 | run_migrations_offline() 94 | else: 95 | run_migrations_online() 96 | -------------------------------------------------------------------------------- /services/spider/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /services/spider/migrations/versions/81a88acb3641_记录cookies.py: -------------------------------------------------------------------------------- 1 | """记录cookies 2 | 3 | Revision ID: 81a88acb3641 4 | Revises: 8efa2b9dcc87 5 | Create Date: 2020-12-22 15:37:26.700404 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '81a88acb3641' 14 | down_revision = '8efa2b9dcc87' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('results', sa.Column('cookies', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade(): 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('results', 'cookies') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /services/spider/migrations/versions/8efa2b9dcc87_init.py: -------------------------------------------------------------------------------- 1 | """init 2 | 3 | Revision ID: 8efa2b9dcc87 4 | Revises: 5 | Create Date: 2020-12-08 10:22:43.545415 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '8efa2b9dcc87' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('crawl_tasks', 22 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False), 23 | sa.Column('subtask_id', sa.Integer(), nullable=False), 24 | sa.Column('url_nested_list', postgresql.JSONB(astext_type=sa.Text()), nullable=True), 25 | sa.Column('process_state', sa.String(length=30), server_default='readying', nullable=True), 26 | sa.Column('failure_url_ids', sa.ARRAY(sa.Integer()), server_default='{}', nullable=True), 27 | sa.Column('finished_at', sa.TIMESTAMP(), nullable=True), 28 | sa.Column('options', postgresql.JSONB(astext_type=sa.Text()), nullable=True), 29 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True), 30 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True), 31 | sa.PrimaryKeyConstraint('id') 32 | ) 33 | op.create_index(op.f('ix_crawl_tasks_create_time'), 'crawl_tasks', ['create_time'], unique=False) 34 | op.create_index(op.f('ix_crawl_tasks_subtask_id'), 'crawl_tasks', ['subtask_id'], unique=False) 35 | op.create_index(op.f('ix_crawl_tasks_update_time'), 'crawl_tasks', ['update_time'], unique=False) 36 | op.create_table('results', 37 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False), 38 | sa.Column('subtask_id', sa.Integer(), nullable=False), 39 | sa.Column('url_id', sa.Integer(), nullable=False), 40 | sa.Column('url_address', sa.String(length=1024), nullable=False), 41 | sa.Column('http_code', sa.Integer(), nullable=True), 42 | sa.Column('title', sa.Text(), nullable=True), 43 | sa.Column('content', sa.Text(), nullable=True), 44 | sa.Column('current_url', sa.String(length=1024), nullable=True), 45 | sa.Column('redirect_chain', postgresql.JSONB(astext_type=sa.Text()), nullable=True), 46 | sa.Column('response_headers', postgresql.JSONB(astext_type=sa.Text()), nullable=True), 47 | sa.Column('har_uuid', sa.String(length=128), nullable=True), 48 | sa.Column('screenshot_id', sa.String(length=128), nullable=True), 49 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True), 50 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True), 51 | sa.PrimaryKeyConstraint('id') 52 | ) 53 | op.create_index(op.f('ix_results_create_time'), 'results', ['create_time'], unique=False) 54 | op.create_index(op.f('ix_results_subtask_id'), 'results', ['subtask_id'], unique=False) 55 | op.create_index(op.f('ix_results_update_time'), 'results', ['update_time'], unique=False) 56 | op.create_index(op.f('ix_results_url_id'), 'results', ['url_id'], unique=False) 57 | # ### end Alembic commands ### 58 | 59 | 60 | def downgrade(): 61 | # ### commands auto generated by Alembic - please adjust! 
### 62 | op.drop_index(op.f('ix_results_url_id'), table_name='results') 63 | op.drop_index(op.f('ix_results_update_time'), table_name='results') 64 | op.drop_index(op.f('ix_results_subtask_id'), table_name='results') 65 | op.drop_index(op.f('ix_results_create_time'), table_name='results') 66 | op.drop_table('results') 67 | op.drop_index(op.f('ix_crawl_tasks_update_time'), table_name='crawl_tasks') 68 | op.drop_index(op.f('ix_crawl_tasks_subtask_id'), table_name='crawl_tasks') 69 | op.drop_index(op.f('ix_crawl_tasks_create_time'), table_name='crawl_tasks') 70 | op.drop_table('crawl_tasks') 71 | # ### end Alembic commands ### 72 | -------------------------------------------------------------------------------- /services/spider/requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.4.3 2 | amqp==2.6.1 3 | appdirs==1.4.4 4 | billiard==3.6.3.0 5 | celery==4.3.0 6 | certifi==2020.11.8 7 | chardet==3.0.4 8 | click==7.1.2 9 | configobj==5.0.6 10 | Cython==0.29.21 11 | Flask==1.1.2 12 | Flask-Migrate==2.4.0 13 | Flask-Redis==0.3.0 14 | Flask-SQLAlchemy==2.3.2 15 | gevent==1.4.0 16 | greenlet==0.4.15 17 | grpcio==1.33.2 18 | grpcio-tools==1.33.2 19 | gunicorn==19.9.0 20 | idna==2.8 21 | importlib-metadata==2.0.0 22 | itsdangerous==1.1.0 23 | Jinja2==2.11.2 24 | kombu==4.6.11 25 | Mako==1.1.3 26 | MarkupSafe==1.1.1 27 | marshmallow==2.19.2 28 | protobuf==3.14.0 29 | psutil==5.7.3 30 | psycopg2-binary==2.7.6.1 31 | pyee==7.0.4 32 | pyppeteer==0.2.2 33 | python-dateutil==2.8.1 34 | python-editor==1.0.4 35 | pytz==2020.4 36 | raven==6.10.0 37 | redis==3.5.3 38 | requests==2.22.0 39 | six==1.15.0 40 | SQLAlchemy==1.3.20 41 | tqdm==4.52.0 42 | urllib3==1.25.11 43 | vine==1.3.0 44 | webargs==4.0.0 45 | websockets==8.1 46 | Werkzeug==1.0.1 47 | zipp==3.4.0 48 | zope.event==4.5.0 49 | zope.interface==5.2.0 50 | -------------------------------------------------------------------------------- /services/spider/rpc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/rpc/client/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/rpc/client/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | 5 | import grpc 6 | 7 | from rpc.pb import result_pb2 8 | from rpc.pb.result_pb2_grpc import ResultStub 9 | 10 | CHUNK_SIZE = 10 * 1024 11 | 12 | 13 | def get_file_chunks(filename, folder_path): 14 | yield result_pb2.StreamUploadPictureRequest(filename=filename) 15 | with open(f'/usr/src/app/{folder_path}/' + filename, 'rb') as f: 16 | while True: 17 | piece = f.read(CHUNK_SIZE) 18 | if len(piece) == 0: 19 | return 20 | yield result_pb2.StreamUploadPictureRequest(file_data={"buffer": piece}) 21 | 22 | 23 | def remove_file(file_path): 24 | """ 25 | 删除文件 26 | :param file_path: 27 | :return: 28 | """ 29 | 30 | try: 31 | os.remove(file_path) 32 | except (NotImplementedError, FileNotFoundError): 33 | pass 34 | 35 | 36 | class ResultClient(object): 37 | 38 | def __init__(self, rpc_server): 39 | # RPC服务器信道 40 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[ 41 | ('grpc.max_send_message_length', 
int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024), 42 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024), 43 | ]) 44 | 45 | # 获取Result grpc服务对象 46 | self.stub = ResultStub(channel) 47 | 48 | def save_base_result(self, subtask_id, url_id, url_address, finished_at, **kwargs): 49 | """保存爬虫基本信息""" 50 | 51 | # 返回头部序列化 52 | kwargs['response_headers'] = self.dic2json(kwargs.pop('response_headers', {})) 53 | 54 | # 生成状态码 55 | kwargs['http_code'] = kwargs['redirect_chain'][-1]['redirect_http_code'] if kwargs['redirect_chain'] else None 56 | 57 | # 去除firefox和chrome默认content 58 | if kwargs['content'] and (kwargs['content'].startswith( 59 | '') 60 | or kwargs['content'] == ''): 61 | kwargs['content'] = None 62 | 63 | # # http交互过程序列化 64 | # kwargs['http_archive'] = self.dic2json(kwargs.pop('http_archive', [])) 65 | self.stub.SaveBaseResult( 66 | result_pb2.SaveBaseResultRequest( 67 | subtask_id=subtask_id, url_id=url_id, url_address=url_address, 68 | finished_at=finished_at, **kwargs), 69 | timeout=30 70 | ) 71 | 72 | def upload_screenshot(self, screenshot_name): 73 | """上传截图""" 74 | chunks_generator = get_file_chunks(screenshot_name, folder_path='screenshots') 75 | response = self.stub.StreamUploadPicture(chunks_generator) 76 | file_path = f'/usr/src/app/screenshots/{screenshot_name}' 77 | assert response.length == os.path.getsize(file_path) 78 | remove_file(file_path) 79 | 80 | def set_subtask_status(self, subtask_id, status, finished_at): 81 | """标记子任务爬取状态""" 82 | self.stub.SetSubTaskStatus( 83 | result_pb2.SetSubTaskStatusRequest( 84 | subtask_id=subtask_id, 85 | status=status, 86 | finished_at=finished_at 87 | ), 88 | timeout=30 89 | ) 90 | 91 | def upload_har_file(self, har_file_name): 92 | """上传har文件""" 93 | chunks_generator = get_file_chunks(har_file_name, folder_path='hars') 94 | response = self.stub.StreamUploadHarFile(chunks_generator) 95 | file_path = f'/usr/src/app/hars/{har_file_name}' 96 | assert response.length == os.path.getsize(file_path) 97 | remove_file(file_path) 98 | 99 | @staticmethod 100 | def dic2json(dic): 101 | """某些字段转换为json""" 102 | return json.dumps(dic, ensure_ascii=False) 103 | -------------------------------------------------------------------------------- /services/spider/rpc/pb/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/sources.list: -------------------------------------------------------------------------------- 1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free 2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free 3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free 5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free 6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free -------------------------------------------------------------------------------- /services/spider/webs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | from flask import Flask 6 | 7 | from webs.api.utils.requests import before_request_middleware, \ 8 | after_request_middleware, teardown_appcontext_middleware 9 | from 
webs.api.utils.responses import JSONResponse, app_error_handler 10 | from webs.api.utils.routers import register_routes as init_routes 11 | from webs.api.utils.settings import init_db 12 | 13 | 14 | def create_app(): 15 | # instantiate the app 16 | app = Flask(__name__) 17 | 18 | # set config 19 | app_settings = os.getenv('APP_SETTINGS') 20 | app.config.from_object(app_settings) 21 | 22 | # register all blueprints 23 | init_routes(app=app) 24 | 25 | # register custom response class 26 | app.response_class = JSONResponse 27 | 28 | # register custom error handler 29 | app_error_handler(app=app) 30 | 31 | # register before request middleware 32 | before_request_middleware(app=app) 33 | 34 | # register after request middleware 35 | after_request_middleware(app=app) 36 | 37 | # register after app context teardown middleware 38 | teardown_appcontext_middleware(app=app) 39 | 40 | # set up extensions 41 | app_db = init_db(app=app) 42 | 43 | # shell context for flask cli 44 | @app.shell_context_processor 45 | def ctx(): 46 | return {'app': app, 'db': app_db} 47 | 48 | return app 49 | -------------------------------------------------------------------------------- /services/spider/webs/api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/webs/api/bizs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/webs/api/bizs/crawl_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models.db_proxy import crawl_task_model_proxy 4 | 5 | 6 | class CrawlTaskBiz(object): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def create_crawl_task(self, subtask_id, url_nested_list, options={}): 12 | """ 13 | 调度爬虫 14 | :param subtask_id: 15 | :param url_nested_list: 16 | :param options: 17 | :return: 18 | """ 19 | 20 | # 创建CrawlTask对象 21 | crawl_task_obj = crawl_task_model_proxy.create( 22 | subtask_id=subtask_id, url_nested_list=url_nested_list, 23 | process_state='readying', options=options) 24 | 25 | # 异步抓取 26 | from worker import celery_app 27 | celery_app.send_task( 28 | name='fetch_tasks', queue='priority_fetch', priority=options['priority'], 29 | kwargs={'crawl_task_id': crawl_task_obj.id}) 30 | -------------------------------------------------------------------------------- /services/spider/webs/api/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/webs/api/exceptions/customs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from werkzeug.exceptions import BadRequest, \ 5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict 6 | 7 | 8 | class CustomException(Exception): 9 | """Custom JSON based exception.""" 10 | 11 | status_code = BadRequest.code 12 | message = "" 13 | 14 | def __init__(self, message=None, status_code=None): 15 | """ 16 | :param status_code: response status_code 17 | :param message: exception message 18 | """ 19 | 20 | Exception.__init__(self) 21 | 22 | if message is not None: 23 | self.message = 
message 24 | if status_code is not None: 25 | self.status_code = status_code 26 | 27 | def to_dict(self): 28 | return { 29 | "status": False, 30 | "error": { 31 | "message": self.message, 32 | "type": str(self.__class__.__name__) 33 | } 34 | } 35 | 36 | 37 | class InvalidContentType(CustomException): 38 | """ 39 | Raised when an invalid Content-Type is provided. 40 | """ 41 | 42 | status_code = BadRequest.code 43 | 44 | 45 | class UnauthorizedAPIRequest(CustomException): 46 | """ 47 | Raise if the user is not authorized. Also used if you want to use HTTP 48 | basic auth. 49 | """ 50 | 51 | status_code = Unauthorized.code 52 | 53 | 54 | class InvalidPermissions(CustomException): 55 | """ 56 | Raise if the user doesn't have the permission for the requested resource 57 | but was authenticated. 58 | """ 59 | 60 | status_code = Forbidden.code 61 | 62 | 63 | class InvalidAPIRequest(CustomException): 64 | """ 65 | Raised when an invalid request has been made. 66 | (e.g. accessed unexisting url, the schema validation did 67 | not pass) 68 | """ 69 | 70 | status_code = BadRequest.code 71 | 72 | 73 | class ServerError(CustomException): 74 | """ 75 | Generic internal error. 76 | Inherit this error for all subsequent 77 | errors that are related to database. 78 | """ 79 | 80 | status_code = InternalServerError.code 81 | 82 | 83 | class DatabaseError(CustomException): 84 | """ 85 | Generic database interaction error. 86 | Inherit this error for all subsequent 87 | errors that are related to database. 88 | """ 89 | 90 | status_code = InternalServerError.code 91 | 92 | 93 | class RecordNotFound(DatabaseError): 94 | """ 95 | Raised when the record was not found in the database. 96 | """ 97 | 98 | status_code = NotFound.code 99 | 100 | 101 | class RecordAlreadyExists(DatabaseError): 102 | """ 103 | Raised in the case of violation of a unique constraint. 104 | """ 105 | 106 | status_code = Conflict.code 107 | 108 | 109 | class PublishError(CustomException): 110 | """ 111 | Raised in the case of violation of a publish error. 
112 | """ 113 | 114 | status_code = InternalServerError.code 115 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .base_model import db, redis_store 4 | from .crawl_task import CrawlTask 5 | from .result import Result 6 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/base_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask_sqlalchemy import SQLAlchemy 4 | from flask_redis import FlaskRedis 5 | 6 | db = SQLAlchemy() 7 | redis_store = FlaskRedis() 8 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/crawl_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | 底层爬虫子任务与Url映射关系 6 | """ 7 | 8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY 9 | from sqlalchemy.dialects.postgresql import JSONB 10 | 11 | from webs.api.models import db 12 | 13 | 14 | class CrawlTask(db.Model): 15 | __tablename__ = 'crawl_tasks' 16 | 17 | id = Column(BigInteger, primary_key=True, autoincrement=True) 18 | subtask_id = Column(Integer, nullable=False, index=True) # 所属子任务任务id 19 | url_nested_list = Column(JSONB) # [{"url_id": xxx, "url_address": xxx, 'url_options': {}}] 20 | process_state = Column(String(30), server_default='readying') # readying Started finished 21 | failure_url_ids = Column(ARRAY(Integer), server_default='{}') # 爬取失败url 22 | finished_at = Column(TIMESTAMP) # 完成时间 23 | options = Column(JSONB) # 爬取参数 24 | 25 | # success_count = Column(Integer) # 爬取成功数 26 | # failure_count = Column(Integer) # 爬取失败数 27 | 28 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 29 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 30 | 31 | def __repr__(self): 32 | return f'' 33 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/db_proxy/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .crawl_task import CrawlTaskModelProxy 4 | from .result import ResultModelProxy 5 | 6 | crawl_task_model_proxy = CrawlTaskModelProxy() 7 | result_model_proxy = ResultModelProxy() 8 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/db_proxy/crawl_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import CrawlTask 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class CrawlTaskModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = CrawlTask 11 | 12 | def create(self, **kwargs): 13 | """ 14 | 创建CrawlTask对象 15 | """ 16 | 17 | crawl_task_obj = CrawlTask( 18 | subtask_id=kwargs['subtask_id'], url_nested_list=kwargs['url_nested_list'], 19 | process_state=kwargs['process_state'], options=kwargs['options']) 20 | self.db_session.add(crawl_task_obj) 21 | self.safe_commit() 22 | return crawl_task_obj 23 | -------------------------------------------------------------------------------- 
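Both the crawl-task proxy above and the result proxy that follows inherit from BaseModelProxy (services/spider/webs/api/models/db_proxy/base.py, listed in the tree but not reproduced in this dump). Judging only from the call sites visible here (db_session and safe_commit in the proxies, find_one_with_condition / set_attr / set_many_attr in worker/fetch.py), a minimal sketch of that base class could look like the following; this is an inference from usage, not the actual file contents.

# Hypothetical sketch of db_proxy/base.py, inferred from how the proxies are
# used elsewhere in this repository; the real implementation may differ.
from webs.api.models import db


class BaseModelProxy(object):

    def __init__(self):
        self.db_session = db.session
        self.model = None  # concrete proxies assign their model class here

    def safe_commit(self):
        """Commit the current session, rolling back on failure so it stays usable."""
        try:
            self.db_session.commit()
        except Exception:
            self.db_session.rollback()
            raise

    def find_one_with_condition(self, *conditions):
        """Return the first row matching the given SQLAlchemy filter conditions, or None."""
        return self.db_session.query(self.model).filter(*conditions).first()

    def set_attr(self, obj, field, value):
        """Update a single column on an instance and commit."""
        setattr(obj, field, value)
        self.safe_commit()

    def set_many_attr(self, obj, fields_v):
        """Update several columns on an instance at once and commit."""
        for field, value in fields_v.items():
            setattr(obj, field, value)
        self.safe_commit()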
/services/spider/webs/api/models/db_proxy/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webs.api.models import Result 4 | from webs.api.models.db_proxy.base import BaseModelProxy 5 | 6 | 7 | class ResultModelProxy(BaseModelProxy): 8 | def __init__(self): 9 | super().__init__() 10 | self.model = Result 11 | 12 | def create(self, subtask_id, url_id, url_address, **kwargs): 13 | """ 14 | 保存爬取结果 15 | :param subtask_id: 16 | :param url_id: 17 | :param url_address: 18 | :param kwargs: 19 | :return: 20 | """ 21 | 22 | result_obj = Result( 23 | subtask_id=subtask_id, url_id=url_id, url_address=url_address, 24 | http_code=kwargs.get('http_code'), title=kwargs.get('title'), 25 | content=kwargs.get('content'), current_url=kwargs.get('current_url'), 26 | har_uuid=kwargs.get('har_uuid'), screenshot_id=kwargs.get('screenshot_id'), 27 | response_headers=kwargs.get('response_headers', {}), redirect_chain=kwargs.get('redirect_chain', []), 28 | cookies=kwargs.get('cookies', []) 29 | ) 30 | self.db_session.add(result_obj) 31 | self.safe_commit() 32 | return result_obj 33 | 34 | def query_already_crawl_url_ids(self, subtask_id): 35 | """ 36 | 查询已经抓取过的url 37 | :param subtask_id: 38 | :return: 39 | """ 40 | 41 | query = self.db_session.query(self.model.url_id).filter(self.model.subtask_id == subtask_id).all() 42 | return [each[0] for each in query] 43 | -------------------------------------------------------------------------------- /services/spider/webs/api/models/result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | 底层存储结果值 作为备份使用 6 | """ 7 | 8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text 9 | from sqlalchemy.dialects.postgresql import JSONB 10 | 11 | from webs.api.models import db 12 | 13 | 14 | class Result(db.Model): 15 | __tablename__ = 'results' 16 | 17 | id = Column(BigInteger, primary_key=True, autoincrement=True) 18 | subtask_id = Column(Integer, nullable=False, index=True) # 所属子任务任务id 19 | url_id = Column(Integer, nullable=False, index=True) # url id 20 | url_address = Column(String(1024), nullable=False) # url 地址 21 | http_code = Column(Integer) # 网站状态码 22 | title = Column(Text) # 网站标题 23 | content = Column(Text) # 网站内容 24 | current_url = Column(String(1024)) # 网站最后相应的地址 25 | redirect_chain = Column(JSONB) # 重定向链接 26 | response_headers = Column(JSONB) # response headers 27 | har_uuid = Column(String(128)) # 网站交互过程存储文件 28 | screenshot_id = Column(String(128)) # 截图Id 29 | cookies = Column(JSONB) # cookies 30 | 31 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True) 32 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True) 33 | 34 | def __repr__(self): 35 | return f'' 36 | -------------------------------------------------------------------------------- /services/spider/webs/api/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from webs.api.exceptions.customs import InvalidAPIRequest 5 | 6 | 7 | class LengthChecker(object): 8 | """字段长度校验""" 9 | 10 | def __init__(self, sign, length): 11 | self.sign = sign 12 | self.length = length 13 | 14 | def __call__(self, verified): 15 | if verified is not None and len(verified) > self.length: 16 | raise InvalidAPIRequest(f'{self.sign}长度过长!') 17 | 18 | 19 | class OneOf(object): 20 | """Validator which succeeds 
if ``value`` is a member of ``choices``""" 21 | 22 | def __init__(self, choices): 23 | self.choices = choices 24 | 25 | def __call__(self, verified): 26 | if verified not in self.choices: 27 | raise InvalidAPIRequest(f'请选择{self.choices}其中之一!') 28 | -------------------------------------------------------------------------------- /services/spider/webs/api/schemas/crawl_tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from webargs import fields 4 | 5 | from webs.api.schemas import OneOf 6 | 7 | create_crawl_task_schema = { 8 | 'subtask_id': fields.Int(required=True), 9 | 'url_nested_list': fields.DelimitedList(fields.Nested({ 10 | 'url_id': fields.Int(required=True), 11 | 'url_address': fields.Str(required=True), 12 | 'url_options': fields.Dict(missing={}) 13 | }), required=True), 14 | 'options': fields.Nested({ 15 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])), 16 | 'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])), # 任务优先级 17 | 'headless': fields.Bool(missing=False), # 有头/无头模式 默认使用有头模式 18 | 'debug': fields.Bool(missing=False), # 是否开启调试模式, 19 | 'referer': fields.Str(), # 网站来路地址 20 | 'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])), # 并发数 21 | 'url_timeout': fields.Int(missing=30), # 单个url超时时间 22 | 'enabled_render_js': fields.Bool(missing=True), 23 | 'page_wait_time': fields.Int(missing=3), # 等待页面js渲染时间 24 | 'ignore_ssl': fields.Bool(missing=True), # 是否忽略证书错误 25 | 'screenshot': fields.Bool(missing=False), # 是否截图 26 | 'proxy_url': fields.Str(), # 代理 27 | 'user_agent': fields.Str(), # Ua 28 | 'record_har': fields.Bool(missing=False), # 请求networks 29 | 'record_redirect': fields.Bool(missing=False), # 是否记录重定向链接 30 | 'use_browser_cache': fields.Bool(missing=True), # 是否使用浏览器缓存 31 | 'use_result_cache': fields.Bool(missing=True), # 是否使用结果缓存 32 | 'wappalyzer': fields.Bool(missing=False), # 是否使用指纹识别 33 | 'wait_until': fields.Str( 34 | missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])), # 控制页面何时加载成功 35 | 'rpc_server': fields.Str(required=True) 36 | }, missing={}) 37 | } 38 | -------------------------------------------------------------------------------- /services/spider/webs/api/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/webs/api/utils/loggers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import socket 6 | import sys 7 | import traceback 8 | from datetime import datetime 9 | 10 | try: 11 | import simplejson as json 12 | except ImportError: 13 | import json 14 | 15 | 16 | class JSONFormatter(logging.Formatter): 17 | """ 18 | JSON formatter for python logging 19 | 20 | You can pass additional tags on a per message basis using the 21 | key "tags" in the extra parameter. 
22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]}) 23 | """ 24 | 25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON', 26 | indent=None): 27 | """ 28 | :param tags: a list of tags to add to every messages 29 | :hostname: force a specific hostname 30 | :fqdn: a boolean to use the FQDN instead of the machine's hostname 31 | :message_type: the message type for Logstash formatters 32 | :indent: indent level of the JSON output 33 | """ 34 | self.message_type = message_type 35 | self.tags = tags if tags is not None else [] 36 | self.extra_tags = [] 37 | self.indent = indent 38 | 39 | if hostname: 40 | self.host = hostname 41 | elif fqdn: 42 | self.host = socket.getfqdn() 43 | else: 44 | self.host = socket.gethostname() 45 | 46 | def get_extra_fields(self, record): 47 | # The list contains all the attributes listed in 48 | # http://docs.python.org/library/logging.html#logrecord-attributes 49 | skip_list = [ 50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args', 51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg', 52 | 'msecs', 'msecs', 'message', 'name', 'pathname', 'process', 53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra'] 54 | 55 | if sys.version_info < (3, 0): 56 | easy_types = (str, bool, dict, float, int, list, type(None)) 57 | else: 58 | easy_types = (str, bool, dict, float, int, list, type(None)) 59 | 60 | fields = {} 61 | 62 | self.extra_tags = [] 63 | for key, value in record.__dict__.items(): 64 | if key not in skip_list: 65 | if key == 'tags' and isinstance(value, list): 66 | self.extra_tags = value 67 | elif isinstance(value, easy_types): 68 | fields[key] = value if value else "null" 69 | else: 70 | fields[key] = repr(value) 71 | 72 | return fields 73 | 74 | def get_debug_fields(self, record): 75 | if record.exc_info: 76 | exc_info = self.format_exception(record.exc_info) 77 | else: 78 | exc_info = record.exc_text 79 | return { 80 | 'exc_info': exc_info, 81 | 'filename': record.filename, 82 | 'lineno': record.lineno, 83 | } 84 | 85 | @classmethod 86 | def format_source(cls, message_type, host, path): 87 | return "%s://%s/%s" % (message_type, host, path) 88 | 89 | @classmethod 90 | def format_timestamp(cls, time): 91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X")) 92 | 93 | @classmethod 94 | def format_exception(cls, exc_info): 95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else '' 96 | 97 | @classmethod 98 | def serialize(cls, message, indent=None): 99 | return json.dumps(message, ensure_ascii=False, indent=indent) 100 | 101 | def format(self, record, serialize=True): 102 | old_message = record.getMessage() 103 | try: 104 | new_message = json.loads(old_message) 105 | except json.decoder.JSONDecodeError as e: 106 | message = old_message.replace("'", '"') 107 | new_message = json.loads(message) 108 | except Exception: 109 | new_message = record.getMessage() 110 | # Create message dict 111 | message = { 112 | 'timestamp': self.format_timestamp(record.created), 113 | 'app': os.environ.get('APP_NAME'), 114 | 'host': self.host, 115 | 'environment': os.environ.get('FLASK_ENV'), 116 | 'logger': record.name, 117 | 'level': record.levelname, 118 | 'messages': new_message, 119 | 'path': record.pathname, 120 | 'tags': self.tags[:] 121 | } 122 | 123 | # Add extra fields 124 | message.update(self.get_extra_fields(record)) 125 | 126 | # Add extra tags 127 | if self.extra_tags: 128 | message['tags'].extend(self.extra_tags) 129 | 130 | # If exception, 
add debug info 131 | if record.exc_info or record.exc_text: 132 | message.update(self.get_debug_fields(record)) 133 | 134 | if serialize: 135 | return self.serialize(message, indent=self.indent) 136 | return message 137 | -------------------------------------------------------------------------------- /services/spider/webs/api/utils/requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from flask import current_app, request 5 | from sqlalchemy.exc import DatabaseError 6 | from webs.api.exceptions.customs import InvalidContentType 7 | from webs.api.models import db 8 | 9 | ACL_ORIGIN = 'Access-Control-Allow-Origin' 10 | ACL_METHODS = 'Access-Control-Allow-Methods' 11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers' 12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials' 13 | ACL_CACHE_CONTROL = 'Cache-Control' 14 | 15 | GET_METHOD = 'GET' 16 | OPTIONS_METHOD = 'OPTIONS' 17 | ALLOWED_ORIGINS = '*' 18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS' 19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \ 20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type' 21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie 22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate' 23 | 24 | 25 | def before_request_middleware(app): 26 | app.before_request_funcs.setdefault(None, [ 27 | ensure_request_log, 28 | ensure_content_type, 29 | ]) 30 | 31 | 32 | def after_request_middleware(app): 33 | app.after_request_funcs.setdefault(None, [ 34 | enable_cors, 35 | commit_session, 36 | ]) 37 | 38 | 39 | def teardown_appcontext_middleware(app): 40 | app.teardown_appcontext_funcs = [ 41 | shutdown_session, 42 | ] 43 | 44 | 45 | def ensure_request_log(): 46 | """当为生产环境时,屏蔽中间件日志记录器""" 47 | if current_app.debug: 48 | current_app.logger.info( 49 | "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || " 50 | "Parameters: {param}".format( 51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr), 53 | path=request.full_path, 54 | param=request.data.decode('utf-8'))) 55 | 56 | 57 | def ensure_content_type(): 58 | """ 59 | Ensures that the Content-Type for all requests 60 | is `application-json` or `multipart/form-data`, otherwise appropriate error 61 | is raised. 62 | :raises: InvalidContentType if Content-Type is not `application-json` 63 | or not `multipart/form-data` 64 | """ 65 | 66 | content_type = request.headers.get('Content-Type') 67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \ 68 | (not content_type or not ('application/json' in content_type or 69 | 'multipart/form-data' in content_type)): 70 | raise InvalidContentType( 71 | message='Invalid Content-Type. ' 72 | 'Only `application/json` or `multipart/form-data` is allowed') 73 | 74 | 75 | def enable_cors(response): 76 | """ 77 | Enable Cross-origin resource sharing. 78 | These headers are needed for the clients that 79 | will consume the API via AJAX requests. 
80 | """ 81 | if request.method == OPTIONS_METHOD: 82 | response = current_app.make_default_options_response() 83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS 84 | response.headers[ACL_METHODS] = ALLOWED_METHODS 85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS 86 | response.headers[ACL_CACHE_CONTROL] = ACL_CACHE_CONTROL 87 | 88 | return response 89 | 90 | 91 | def commit_session(response): 92 | """ 93 | Try to commit the db session in the case 94 | of a successful request with status_code 95 | under 400. 96 | """ 97 | if response.status_code >= 400: 98 | return response 99 | try: 100 | db.session.commit() 101 | except DatabaseError: 102 | db.session.rollback() 103 | return response 104 | 105 | 106 | def shutdown_session(exception=None): 107 | """ 108 | Remove the db session and detach from the 109 | database driver after application shutdown. 110 | """ 111 | db.session.remove() 112 | -------------------------------------------------------------------------------- /services/spider/webs/api/utils/routers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pkgutil 4 | 5 | 6 | def register_routes(app): 7 | """Register routes.""" 8 | from .. import views 9 | from flask.blueprints import Blueprint 10 | 11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."): 12 | blueprint_name = name.split('.')[-1] 13 | modules = __import__(name, fromlist="dummy") 14 | blueprint = getattr(modules, blueprint_name) 15 | if isinstance(blueprint, Blueprint): 16 | app.register_blueprint(blueprint) 17 | -------------------------------------------------------------------------------- /services/spider/webs/api/utils/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask_migrate import Migrate 4 | 5 | from webs.api.models import db, redis_store 6 | 7 | 8 | def init_db(app): 9 | """ 10 | Create database if doesn't exist and 11 | create all tables. 
12 | """ 13 | 14 | # 初始化pg 15 | db.init_app(app) 16 | migrate = Migrate(compare_type=True, compare_server_default=True) 17 | migrate.init_app(app, db) 18 | 19 | # 初始化Redis 20 | redis_store.init_app(app) 21 | 22 | return db 23 | -------------------------------------------------------------------------------- /services/spider/webs/api/views/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/webs/api/views/crawl_tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask import Blueprint, jsonify 4 | from webargs.flaskparser import use_args 5 | 6 | from webs.api.bizs.crawl_task import CrawlTaskBiz 7 | from webs.api.schemas.crawl_tasks import create_crawl_task_schema 8 | 9 | crawl_tasks = Blueprint('crawl_tasks', __name__, url_prefix='/crawl_tasks') 10 | 11 | 12 | @crawl_tasks.route('', methods=['POST']) 13 | @use_args(create_crawl_task_schema, locations=('json',)) 14 | def create_crawl_task(args): 15 | """ 16 | 创建爬虫任务 17 | :param args: 18 | :return: 19 | """ 20 | crawl_task_biz = CrawlTaskBiz() 21 | data = crawl_task_biz.create_crawl_task(**args) 22 | 23 | return jsonify({ 24 | 'status': True, 25 | 'data': data 26 | }), 201 27 | -------------------------------------------------------------------------------- /services/spider/webs/api/views/ping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from flask import Blueprint, jsonify 4 | 5 | ping = Blueprint('ping', __name__) 6 | 7 | 8 | @ping.route('/ping', methods=['GET']) 9 | def ping_pong(): 10 | """ 11 | 测试服务是否可用 12 | """ 13 | return jsonify({ 14 | "data": "pong", 15 | "status": True 16 | }) 17 | -------------------------------------------------------------------------------- /services/spider/webs/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | 6 | class BaseConfig: 7 | """Base configuration""" 8 | 9 | # Root path of project 10 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 11 | 12 | DEBUG = True 13 | TESTING = False 14 | SQLALCHEMY_TRACK_MODIFICATIONS = False 15 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True} 16 | SECRET_KEY = os.environ.get('SECRET_KEY') 17 | 18 | # Redis configuration 19 | REDIS_URL = os.environ.get('REDIS_URL') 20 | 21 | 22 | class DevelopmentConfig(BaseConfig): 23 | """Development configuration""" 24 | 25 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL') 26 | 27 | 28 | class ProductionConfig(BaseConfig): 29 | """Production configuration""" 30 | 31 | DEBUG = False 32 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL') 33 | -------------------------------------------------------------------------------- /services/spider/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | from celery import Celery 5 | 6 | ################## 7 | # Celery配置 8 | from kombu import Queue 9 | 10 | from webs import create_app 11 | 12 | 13 | class CeleryConfig(object): 14 | # 任务与劣化为json,从Celery4.0开始,默认序列化器将为json 15 | task_serializer = 'json' 16 | 17 | # 结果序列化为json 18 | result_serializer = 'json' 19 | 20 | # 定时任务过期时间 21 | result_expires = 60 * 60 * 24 22 | 23 | # 关闭worker事件监听 防止队列溢出 24 
| worker_send_task_events = False 25 | 26 | # 允许接收的任务类型 27 | accept_content = ["json"] 28 | 29 | # 每个进程预取任务数,启动参数进行覆盖设置,此处仅作为标记使用 30 | worker_prefetch_multiplier = 4 31 | 32 | # 每个worker执行1个任务就销毁重启,启动参数进行覆盖设置,此处仅作为标记使用 33 | worker_max_tasks_per_child = 1 34 | 35 | # 时区设置 36 | timezone = 'Asia/Shanghai' 37 | enable_utc = True 38 | 39 | 40 | ################## 41 | # 初始化celery worker 42 | def init_celery(app=None, celery_type='usual'): 43 | app = app or create_app() 44 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL')) 45 | celery_app.config_from_object(CeleryConfig) 46 | 47 | # 导入相关任务模块 48 | if celery_type == 'usual': 49 | celery_app.conf.update(imports=['worker.fetch', 'worker.results']) 50 | celery_app.conf.task_queues = ( 51 | Queue("priority_fetch", queue_arguments={'x-max-priority': 5}), 52 | Queue("results"), 53 | ) 54 | elif celery_type == 'beat': 55 | pass 56 | # celery_app.conf.update( 57 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron']) 58 | # celery_app.conf.update( 59 | # CELERYBEAT_SCHEDULE={ 60 | # } 61 | # ) 62 | 63 | # 在flask上下文中执行 64 | class ContextTask(celery_app.Task): 65 | """Make celery tasks work with Flask app context""" 66 | 67 | def __call__(self, *args, **kwargs): 68 | with app.app_context(): 69 | return self.run(*args, **kwargs) 70 | 71 | celery_app.Task = ContextTask 72 | return celery_app 73 | 74 | 75 | celery_app = init_celery() 76 | # beat_app = init_celery(celery_type='beat') 77 | -------------------------------------------------------------------------------- /services/spider/worker/fetch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | 5 | from billiard.exceptions import SoftTimeLimitExceeded 6 | 7 | from rpc.client.result import ResultClient 8 | from webs.api.models.db_proxy import crawl_task_model_proxy, result_model_proxy 9 | from worker import celery_app 10 | from worker.library.playwright import PlayWrightHandler 11 | 12 | 13 | @celery_app.task( 14 | name='fetch_tasks', queue='priority_fetch', acks_late=True, soft_time_limit=1000, max_retries=1, 15 | default_retry_delay=30, autoretry_for=(Exception,)) 16 | def fetch_tasks(crawl_task_id): 17 | """ 18 | 通过优先级队列取得任务进行抓取 19 | """ 20 | 21 | crawl_task_obj = crawl_task_model_proxy.find_one_with_condition( 22 | crawl_task_model_proxy.model.id == crawl_task_id, 23 | crawl_task_model_proxy.model.process_state != 'finished' 24 | ) 25 | if not crawl_task_obj: 26 | return 27 | 28 | # 设置爬取任务开始 29 | if crawl_task_obj.process_state == 'readying': 30 | crawl_task_model_proxy.set_attr(crawl_task_obj, 'process_state', 'running') 31 | url_nested_list = crawl_task_obj.url_nested_list 32 | 33 | # 导致此情况原因为worker进程异常退出,rabbitmq未确认此消息,worker重启此任务再次被投递 34 | else: # crawl_task_obj.process_state == 'running' 35 | already_url_ids = result_model_proxy.query_already_crawl_url_ids(subtask_id=crawl_task_obj.subtask_id) 36 | url_nested_list = [ 37 | url_info for url_info in crawl_task_obj.url_nested_list 38 | if url_info['url_id'] not in already_url_ids 39 | ] 40 | undone_url_ids = [] 41 | if url_nested_list: 42 | # 执行抓取 43 | playwright_handler = PlayWrightHandler( 44 | subtask_id=crawl_task_obj.subtask_id, 45 | url_nested_list=url_nested_list, 46 | options=crawl_task_obj.options) 47 | undone_url_ids = playwright_handler.run() 48 | 49 | # 设置爬取状态、结束时间、抓取失败的urls 50 | crawl_task_model_proxy.set_many_attr( 51 | obj=crawl_task_obj, 52 | fields_v={ 53 
| 'process_state': 'finished', 54 | 'finished_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 55 | 'failure_url_ids': undone_url_ids 56 | } 57 | ) 58 | 59 | ####### 调用engine端rpc服务设置subtask爬取状态 60 | # 连接grpc服务 61 | grpc_result_client = ResultClient(crawl_task_obj.options.get('rpc_server')) 62 | 63 | # 设置Subtask爬取状态 64 | grpc_result_client.set_subtask_status( 65 | subtask_id=crawl_task_obj.subtask_id, status=True, finished_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 66 | -------------------------------------------------------------------------------- /services/spider/worker/library/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /services/spider/worker/library/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from typing import Optional 5 | 6 | if sys.version_info >= (3, 8): 7 | from typing import TypedDict # pylint: disable=no-name-in-module 8 | else: 9 | from typing_extensions import TypedDict 10 | 11 | 12 | class ProxyServer(TypedDict): 13 | server: str 14 | bypass: Optional[str] 15 | username: Optional[str] 16 | password: Optional[str] 17 | 18 | 19 | class RecordHarOptions(TypedDict): 20 | omitContent: Optional[bool] 21 | path: str 22 | -------------------------------------------------------------------------------- /services/spider/worker/results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import hashlib 3 | import json 4 | import os 5 | import time 6 | 7 | from rpc.client.result import ResultClient 8 | from webs.api.models.db_proxy import result_model_proxy 9 | from worker import celery_app 10 | 11 | 12 | @celery_app.task(name='save_results', queue='results') 13 | def save_results(subtask_id, url_id, url_address, rpc_server, **kwargs): 14 | """ 15 | 保存爬取结果 16 | :param subtask_id: 17 | :param url_id: 18 | :param url_address: 19 | :param rpc_server: 20 | :param kwargs: 21 | :return: 22 | """ 23 | 24 | http_archive_dict = kwargs.pop('http_archive_dict') 25 | 26 | # 保存爬取结果,仅作为容灾备份使用 27 | result_model_proxy.create(subtask_id, url_id, url_address) 28 | 29 | # 连接grpc服务 30 | grpc_result_client = ResultClient(rpc_server) 31 | 32 | # 反馈截图 33 | if kwargs.get('screenshot_id') \ 34 | and os.path.exists('/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id'])): 35 | img_path = '/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id']) 36 | try: 37 | with open(img_path, 'rb') as f: 38 | md5 = hashlib.md5() 39 | while True: 40 | fb = f.read(8096) 41 | if not fb: 42 | break 43 | md5.update(fb) 44 | screenshot_md5 = md5.hexdigest() 45 | os.rename(img_path, f'/usr/src/app/screenshots/{screenshot_md5}.png') 46 | kwargs['screenshot_id'] = screenshot_md5 47 | grpc_result_client.upload_screenshot(screenshot_name=f'{screenshot_md5}.png') 48 | except Exception as e: 49 | pass 50 | 51 | # 向engine反馈基本爬取数据 52 | grpc_result_client.save_base_result(subtask_id, url_id, url_address, **kwargs) 53 | 54 | # 反馈har文件 55 | if kwargs.get('har_uuid') and http_archive_dict.get('hars'): 56 | with open('/usr/src/app/hars/{}.json'.format(kwargs['har_uuid']), 'w+', encoding='utf-8') as f: 57 | f.write(json.dumps(http_archive_dict, ensure_ascii=False, indent=2)) 58 | grpc_result_client.upload_har_file(har_file_name='{}.json'.format(kwargs['har_uuid'])) 59 | 
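The save_results task above is consumed from the `results` queue; the producer is presumably the PlayWright handler in worker/library/playwright.py, which is not included in this dump. A hedged sketch of how such a dispatch could look, mirroring the celery_app.send_task call used for fetch_tasks in webs/api/bizs/crawl_task.py (all field values below are placeholders, not taken from the repository):

# Hypothetical producer side for the save_results task; the real call lives in
# worker/library/playwright.py (not shown) and may pass additional fields.
from datetime import datetime

from worker import celery_app

celery_app.send_task(
    name='save_results', queue='results',
    kwargs={
        'subtask_id': 1,
        'url_id': 100,
        'url_address': 'https://example.com',
        'rpc_server': 'engine:50051',        # engine-side gRPC endpoint from the task options (example value)
        'http_archive_dict': {'hars': []},   # written to /usr/src/app/hars/<har_uuid>.json when har_uuid is set
        'finished_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'title': 'Example Domain',
        'content': '<html>...</html>',
        'current_url': 'https://example.com/',
        'response_headers': {'content-type': 'text/html'},
        'redirect_chain': [],
        'cookies': [],
    })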
-------------------------------------------------------------------------------- /spider.docker-conpose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | spider-client: 5 | container_name: spider-client 6 | build: 7 | context: ./services/spider 8 | dockerfile: Dockerfile-dev 9 | volumes: 10 | - './services/spider:/usr/src/app' 11 | ports: 12 | - 15001:5000 13 | env_file: 14 | - spider.env 15 | environment: 16 | - FLASK_APP=webs:create_app 17 | - ENDPOINT=web 18 | restart: always 19 | 20 | spider-fetch: 21 | container_name: spider-fetch 22 | build: 23 | context: ./services/spider 24 | dockerfile: Dockerfile-dev 25 | volumes: 26 | - './services/spider:/usr/src/app' 27 | env_file: 28 | - spider.env 29 | environment: 30 | - ENDPOINT=fetch 31 | restart: always 32 | 33 | 34 | save-results: 35 | container_name: save-results 36 | build: 37 | context: ./services/spider 38 | dockerfile: Dockerfile-dev 39 | volumes: 40 | - './services/spider:/usr/src/app' 41 | env_file: 42 | - spider.env 43 | environment: 44 | - ENDPOINT=results 45 | restart: always -------------------------------------------------------------------------------- /架构图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/架构图.png --------------------------------------------------------------------------------
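Once the three services defined in spider.docker-conpose.yml are up (a spider.env file supplying values such as DATABASE_URL, REDIS_URL, CRAWL_CELERY_BROKER_URL and APP_SETTINGS is expected alongside it), spider-client serves the Flask API on host port 15001, so a crawl task can be created through the POST /crawl_tasks endpoint shown earlier. A minimal sketch, assuming the stack runs on localhost and that the engine's gRPC service is reachable at engine:50051 (both are example values, not taken from this repository):

# Minimal client-side sketch for the POST /crawl_tasks endpoint; the host/port come
# from the 15001:5000 mapping above, everything else is an example value.
import requests

payload = {
    'subtask_id': 1,
    'url_nested_list': [
        {'url_id': 100, 'url_address': 'https://example.com', 'url_options': {}},
    ],
    'options': {
        'browser_type': 'firefox',
        'screenshot': True,
        'record_har': True,
        'rpc_server': 'engine:50051',  # engine-side gRPC endpoint (example value)
    },
}

# Content-Type must be application/json (enforced by ensure_content_type).
resp = requests.post('http://localhost:15001/crawl_tasks', json=payload)
print(resp.status_code, resp.json())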