├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── crawloop.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── engine.docker-compose.yml
├── image_build
│   ├── msyh.ttf
│   ├── playwright-xvfb
│   └── sources.list
├── services
│   ├── engine
│   │   ├── .dockerignore
│   │   ├── Dockerfile-dev
│   │   ├── aps
│   │   │   ├── __init__.py
│   │   │   ├── func.py
│   │   │   ├── logger.py
│   │   │   └── server.py
│   │   ├── apserver.py
│   │   ├── build.sh
│   │   ├── build_sentry_ini.py
│   │   ├── compile.py
│   │   ├── entrypoint.sh
│   │   ├── grpcserver.py
│   │   ├── gunicorn_config.py
│   │   ├── gunicorn_logging.ini
│   │   ├── manage.py
│   │   ├── migrations
│   │   │   ├── README
│   │   │   ├── alembic.ini
│   │   │   ├── env.py
│   │   │   ├── script.py.mako
│   │   │   └── versions
│   │   │       ├── 1569921cac58_加入响应时间和渲染时间.py
│   │   │       ├── 4a243739ef84_初始化.py
│   │   │       ├── 5b189e0161ee_加入网站编码.py
│   │   │       ├── 71bf761944f8_新增网站图标字段.py
│   │   │       └── b3bd5bc9e4e3_增加extra客户端额外数据.py
│   │   ├── requirements.txt
│   │   ├── rpc
│   │   │   ├── __init__.py
│   │   │   ├── client
│   │   │   │   ├── __init__.py
│   │   │   │   └── callback_client.py
│   │   │   ├── codegen.sh
│   │   │   ├── pb
│   │   │   │   ├── __init__.py
│   │   │   │   ├── callback_pb2.py
│   │   │   │   ├── callback_pb2_grpc.py
│   │   │   │   ├── result_pb2.py
│   │   │   │   └── result_pb2_grpc.py
│   │   │   ├── protos
│   │   │   │   └── result.proto
│   │   │   └── server
│   │   │       ├── __init__.py
│   │   │       └── result.py
│   │   ├── sources.list
│   │   ├── wappalyzer
│   │   │   ├── __init__.py
│   │   │   ├── data.json
│   │   │   ├── helper.py
│   │   │   ├── modelcalss.py
│   │   │   └── wappalyzerhandler.py
│   │   ├── webs
│   │   │   ├── __init__.py
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bizs
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── result.py
│   │   │   │   │   └── task.py
│   │   │   │   ├── exceptions
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── customs.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── apscheduler_job.py
│   │   │   │   │   ├── base_model.py
│   │   │   │   │   ├── db_proxy
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── apschedule.py
│   │   │   │   │   │   ├── base.py
│   │   │   │   │   │   ├── result.py
│   │   │   │   │   │   ├── schedule_task.py
│   │   │   │   │   │   ├── server.py
│   │   │   │   │   │   ├── subtask.py
│   │   │   │   │   │   ├── task.py
│   │   │   │   │   │   ├── task_url.py
│   │   │   │   │   │   └── url.py
│   │   │   │   │   ├── result.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   ├── task.py
│   │   │   │   │   ├── task_url.py
│   │   │   │   │   └── url.py
│   │   │   │   ├── schemas
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── results.py
│   │   │   │   │   └── tasks.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── helper.py
│   │   │   │   │   ├── loggers.py
│   │   │   │   │   ├── requests.py
│   │   │   │   │   ├── responses.py
│   │   │   │   │   ├── routers.py
│   │   │   │   │   └── settings.py
│   │   │   │   └── views
│   │   │   │       ├── __init__.py
│   │   │   │       ├── ping.py
│   │   │   │       ├── results.py
│   │   │   │       └── tasks.py
│   │   │   ├── config.py
│   │   │   └── core
│   │   │       ├── __init__.py
│   │   │       └── requests
│   │   │           ├── __init__.py
│   │   │           └── request.py
│   │   └── worker
│   │       ├── __init__.py
│   │       ├── engine.py
│   │       ├── library
│   │       │   ├── __init__.py
│   │       │   ├── favicon.py
│   │       │   └── helper.py
│   │       └── result.py
│   └── spider
│       ├── .dockerignore
│       ├── Dockerfile-dev
│       ├── Dockerfile-prod
│       ├── build.sh
│       ├── build_sentry_ini.py
│       ├── compile.py
│       ├── entrypoint.sh
│       ├── gunicorn_config.py
│       ├── gunicorn_logging.ini
│       ├── manage.py
│       ├── migrations
│       │   ├── README
│       │   ├── alembic.ini
│       │   ├── env.py
│       │   ├── script.py.mako
│       │   └── versions
│       │       ├── 81a88acb3641_记录cookies.py
│       │       └── 8efa2b9dcc87_init.py
│       ├── requirements.txt
│       ├── rpc
│       │   ├── __init__.py
│       │   ├── client
│       │   │   ├── __init__.py
│       │   │   └── result.py
│       │   └── pb
│       │       ├── __init__.py
│       │       ├── result_pb2.py
│       │       └── result_pb2_grpc.py
│       ├── sources.list
│       ├── webs
│       │   ├── __init__.py
│       │   ├── api
│       │   │   ├── __init__.py
│       │   │   ├── bizs
│       │   │   │   ├── __init__.py
│       │   │   │   └── crawl_task.py
│       │   │   ├── exceptions
│       │   │   │   ├── __init__.py
│       │   │   │   └── customs.py
│       │   │   ├── models
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base_model.py
│       │   │   │   ├── crawl_task.py
│       │   │   │   ├── db_proxy
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── crawl_task.py
│       │   │   │   │   └── result.py
│       │   │   │   └── result.py
│       │   │   ├── schemas
│       │   │   │   ├── __init__.py
│       │   │   │   └── crawl_tasks.py
│       │   │   ├── utils
│       │   │   │   ├── __init__.py
│       │   │   │   ├── loggers.py
│       │   │   │   ├── requests.py
│       │   │   │   ├── responses.py
│       │   │   │   ├── routers.py
│       │   │   │   └── settings.py
│       │   │   └── views
│       │   │       ├── __init__.py
│       │   │       ├── crawl_tasks.py
│       │   │       └── ping.py
│       │   └── config.py
│       └── worker
│           ├── __init__.py
│           ├── fetch.py
│           ├── library
│           │   ├── __init__.py
│           │   ├── helper.py
│           │   └── playwright.py
│           └── results.py
├── spider.docker-conpose.yml
└── 架构图.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/.idea/.gitignore
--------------------------------------------------------------------------------
/.idea/crawloop.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | A simple Playwright crawler was reimplemented in Go, and its performance turned out to be far beyond what the Python version can reach. A simple Go version of [website directory/file brute forcing](https://github.com/who0sy/WebsiteFuzz) was built afterwards as well, and it runs remarkably fast. When time allows, this project will also be rewritten in Go; the Python version is no longer maintained.
3 | ------------------------------------------------------------------------------------------------------------
4 | 
5 |
6 |
7 |
8 | # crawloop
9 | Crawls JavaScript-rendered dynamic pages with Playwright, collecting page source, screenshots, site entry points, page interactions, and more, with support for priority-based task scheduling.
10 |
11 | crawloop currently supports the following features:
12 | - Native browser environment with Chrome and Firefox support, plus an optional in-container virtual display;
13 | - Full DOM event collection with automated triggering;
14 | - Comprehensive collection and analysis, including JS files, page source, screenshots, favicon, title, charset, cookies, redirect chains, and more;
15 | - Python port of Wappalyzer for website fingerprinting, reporting the technologies in use, their versions, and confidence scores;
16 | - Host binding, with support for custom Referer headers;
17 | - Request proxies, and active callbacks of crawl results over HTTP or gRPC;
18 | - Task progress monitoring: track task progress in real time;
19 | - Custom task parameters: scheduling parameters can be configured online and take effect immediately;
20 | - Scheduling-center HA (centralized): scheduling uses a centralized design; the self-built "scheduling center" component supports cluster deployment to keep the scheduling center highly available;
21 | - Crawler-executor HA (distributed): tasks are executed in a distributed fashion; "crawler executors" support cluster deployment to keep task execution highly available;
22 | - Elastic scaling: whenever executor machines come online or go offline, tasks are reassigned at the next scheduling round;
23 | - Trigger strategies: a rich set of triggers including Cron, fixed interval, fixed delay, API (event), manual, and parent-child task triggers (see the scheduling sketch after this list);
24 | - Blocking strategies: when scheduling becomes too dense for executors to keep up, choose between serial execution on a single machine (default), discarding subsequent runs, or overriding the previous run;
25 | - Task timeout control: custom task timeouts; a task that runs past its timeout is actively interrupted;
26 | - Failure retry: configurable retry counts; failed tasks are retried automatically, sharded tasks retry at shard granularity, and crawls can resume from where they stopped;
27 | - Routing strategies for executor clusters: first (implemented), last (implemented), round robin (implemented), weighted round robin (implemented), consistent hashing (planned), and more;
28 | - Dynamic sharding: broadcast tasks are sharded by size (100 URLs per shard by default) and dispatched shard by shard to different executors; this significantly improves throughput for large crawls;
29 | - Scheduling thread pool: the scheduler fires jobs from multiple threads so schedules run precisely and are never blocked;
30 | - Fully asynchronous: scheduling, execution, and callbacks are all asynchronous, which smooths traffic peaks under dense scheduling and in theory supports tasks of any duration;
31 | - Cross-language: the scheduling center and crawler executors expose language-agnostic RESTful APIs, so any third-party language can integrate with the center or implement a custom executor;
32 | - Task priority control: executors use priority queues to isolate tasks by priority; slow or low-weight tasks are automatically demoted to a "Slow" queue so executors are never exhausted;
33 | - Containerized: the project is compiled inside containers, so everything works out of the box;
34 |
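Internally, the scheduling center registers these triggers with an APScheduler instance that is exposed over rpyc (`services/engine/aps/server.py`, started by `services/engine/apserver.py`). A minimal sketch of adding an interval-triggered job through that RPC service, assuming it is reachable on `localhost:15003` (the port published by `engine.docker-compose.yml`) and that a task with id 42 already exists in the database:

```python
import rpyc

# Connect to the APScheduler RPC service started by services/engine/apserver.py.
conn = rpyc.connect('localhost', 15003)

# exposed_add_task(task_id, schedule_type, schedule_data) maps schedule_type onto an
# APScheduler trigger ('instantly' runs once immediately, 'datetime' becomes 'date',
# anything else such as 'interval' or 'cron' is passed through); schedule_data is
# forwarded as the trigger's keyword arguments.
conn.root.add_task(task_id=42, schedule_type='interval', schedule_data={'minutes': 30})

conn.close()
```
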
35 |
36 | ### Environment (Docker)
37 | - Docker 18.03+
38 | - PostgreSQL 9.x+
39 | - RabbitMQ 3.8.x+
40 | - Docker Compose 1.24+
41 |
42 |
43 | ## Architecture
44 | 
45 | The Crawloop architecture consists of a master node and multiple worker nodes, plus gRPC for communication and a PostgreSQL database for storage.
46 | 
47 | ![Architecture](架构图.png)
48 | 
49 | Client applications request data from the master node. The master dispatches tasks and balances load through Celery and RabbitMQ; once a worker node receives a task it runs the crawl and reports the results back to the master over gRPC, after which they are persisted to the database.
50 | 
51 | The master node is the core of the whole Crawloop architecture and acts as its control center.
52 | 
53 | The master node is mainly responsible for:
54 | 1. Periodic task scheduling
55 | 2. Worker node management and communication
56 | 3. The external API service
57 | 
58 | The master node communicates with clients and asynchronously dispatches crawl tasks to worker nodes through Celery, using a load-balancing algorithm.
59 |
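When a schedule fires, that dispatch is a plain Celery `send_task` call onto the `engine` queue. A condensed sketch of what `services/engine/aps/func.py` does (the max-instances check is omitted), assuming it runs inside the engine service where `worker.celery_app` is importable:

```python
from worker import celery_app  # Celery application defined in the engine service

task_id = 42  # id of a previously created crawl task (hypothetical value)

# Ask an engine worker to split the task into shards and deliver them to crawler executors.
celery_app.send_task(
    name='delivery_task', queue='engine',
    kwargs={'task_id': task_id}
)
```
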
60 | ### Worker nodes
61 | 
62 | Worker nodes execute crawl tasks, call back with the crawled data and logs, and communicate with the master node over gRPC. By adding worker nodes, Crawloop scales horizontally, and different crawl tasks can be assigned to different nodes.
63 |
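Crawl results can also be pushed back over gRPC: the engine's `services/engine/rpc/client/callback_client.py` calls `result.Result/SaveResult` on whatever callback address a task was created with. A minimal sketch of a receiver for that callback, assuming the generated `callback_pb2` / `callback_pb2_grpc` modules from `services/engine/rpc/pb/` are importable and that `0.0.0.0:50051` is the (hypothetical) callback address configured for the task:

```python
import json
from concurrent import futures

import grpc

from rpc.pb import callback_pb2, callback_pb2_grpc  # generated modules shipped with the engine


class CallbackReceiver(callback_pb2_grpc.ResultServicer):
    def SaveResult(self, request, context):
        # The engine sends intermediate results while the task is executing and a final
        # call with finished=True once the whole task is done.
        if request.finished:
            print(f'task {request.task_id} finished for customer {request.customer_id}')
        else:
            result = json.loads(request.crawl_results)  # crawl_results is a JSON document
            print(f'task {request.task_id}: received a partial result with {len(result)} fields')
        # The engine's CallbackClient does not inspect the response fields.
        return callback_pb2.SaveResultResponse()


if __name__ == '__main__':
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    callback_pb2_grpc.add_ResultServicer_to_server(CallbackReceiver(), server)
    server.add_insecure_port('0.0.0.0:50051')
    server.start()
    server.wait_for_termination()
```
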
--------------------------------------------------------------------------------
/engine.docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | engine:
5 | container_name: engine
6 | build:
7 | context: ./services/engine
8 | dockerfile: Dockerfile-dev
9 | volumes:
10 | - './services/engine:/usr/src/app'
11 | ports:
12 | - 15000:5000
13 | env_file:
14 | - engine.env
15 | environment:
16 | - FLASK_APP=webs:create_app
17 | - ENDPOINT=web
18 | restart: always
19 |
20 | engine-worker:
21 | container_name: engine-worker
22 | build:
23 | context: ./services/engine
24 | dockerfile: Dockerfile-dev
25 | volumes:
26 | - './services/engine:/usr/src/app'
27 | env_file:
28 | - engine.env
29 | environment:
30 | - ENDPOINT=engine-worker
31 | restart: always
32 |
33 | engine-grpc:
34 | container_name: engine-grpc
35 | build:
36 | context: ./services/engine
37 | dockerfile: Dockerfile-dev
38 | volumes:
39 | - './services/engine:/usr/src/app'
40 | ports:
41 | - 15002:15002
42 | env_file:
43 | - engine.env
44 | environment:
45 | - ENDPOINT=engine-grpc
46 | restart: always
47 |
48 | engine-apscheduler:
49 | container_name: engine-apscheduler
50 | build:
51 | context: ./services/engine
52 | dockerfile: Dockerfile-dev
53 | volumes:
54 | - './services/engine:/usr/src/app'
55 | ports:
56 | - 15003:15003
57 | env_file:
58 | - engine.env
59 | environment:
60 | - ENDPOINT=apscheduler
61 | restart: always
--------------------------------------------------------------------------------
/image_build/msyh.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/image_build/msyh.ttf
--------------------------------------------------------------------------------
/image_build/playwright-xvfb:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1
4 |
5 | # Replace the default apt sources
6 | COPY ./sources.list /etc/apt/
7 | 
8 | # Install system dependencies
9 | RUN apt-get update && apt-get -y install libnss3 xvfb gconf-service libasound2 libatk1.0-0 libc6 libcairo2 libcups2 \
10 | libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 \
11 | libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 \
12 | libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 \
13 | libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
14 |
15 | # Set the timezone
16 | ENV TZ=Asia/Shanghai
17 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
18 | 
19 | # Install the default font
20 | COPY ./msyh.ttf /usr/share/fonts/
21 | RUN fc-cache -fv
22 | 
23 | # Start the Xvfb virtual display
24 | RUN Xvfb -screen 0 1020x720x16 :99 &
25 | RUN export DISPLAY=:99
26 | ENV DISPLAY=:99
27 | 
28 | # Install the Playwright driver and browsers
29 | RUN pip install playwright==0.162.2 -i https://pypi.douban.com/simple
30 | RUN python -m playwright install
31 |
--------------------------------------------------------------------------------
/image_build/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/engine/.dockerignore:
--------------------------------------------------------------------------------
1 | env
2 | *.env
3 | .dockerignore
4 | Dockerfile-dev
5 | Dockerfile-prod
6 | htmlcov
7 | celerybeat-schedule
8 | scheduler.lock
9 | celerybeat.pid
10 |
11 |
--------------------------------------------------------------------------------
/services/engine/Dockerfile-dev:
--------------------------------------------------------------------------------
1 | # base image
2 | FROM python:3.7-slim
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1
4 |
5 | # update apt source
6 | COPY ./sources.list /etc/apt/
7 |
8 | # install netcat
9 | RUN apt-get update && \
10 | apt-get -y install netcat && \
11 | apt-get clean
12 |
13 | # set timezone
14 | ENV TZ=Asia/Shanghai
15 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
16 |
17 | # set working directory
18 | RUN mkdir -p /usr/src/app
19 | WORKDIR /usr/src/app
20 |
21 | # add and install requirements
22 | COPY ./requirements.txt /usr/src/app/requirements.txt
23 | RUN pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ && \
24 | pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
25 |
26 | # add app
27 | COPY . /usr/src/app
28 |
29 | # run server
30 | CMD ["/usr/src/app/entrypoint.sh"]
31 |
--------------------------------------------------------------------------------
/services/engine/aps/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/services/engine/aps/func.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from aps.logger import scheduler_logger
4 | from webs.api.models.db_proxy import task_model_proxy, schedule_task_proxy
5 | from manage import app
6 |
7 |
8 | def send_task_func(task_id):
9 |     """Dispatch a crawl task"""
10 |     with app.app_context():
11 |         scheduler_logger.info(f'Start scheduling task: [task-id:{task_id}]')
12 |         task_obj = task_model_proxy.find(id=task_id)
13 |         if not task_obj:
14 |             scheduler_logger.info(f'Task already finished or deleted! [task-id:{task_id}]')
15 |             return
16 | 
17 |         # Skip this run if the task already has as many running schedule tasks as max_instances allows
18 |         running_schedule_tasks = schedule_task_proxy.query_running_schedule_tasks(task_id)
19 |         max_instances = task_obj.schedule_options.get('schedule_data', {}).get('max_instances', 1)
20 |         if len(running_schedule_tasks) >= max_instances:
21 |             scheduler_logger.info(f'Task exceeded max_instances, this run is skipped! [task-id:{task_id}]')
22 |             return
23 | 
24 |         # Split the task and dispatch it asynchronously
25 |         from worker import celery_app
26 |         celery_app.send_task(
27 |             name='delivery_task', queue='engine',
28 |             kwargs={'task_id': task_id}
29 |         )
30 |
--------------------------------------------------------------------------------
/services/engine/aps/logger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 |
5 | scheduler_logger = logging.getLogger('scheduler')
6 | stream_handler = logging.StreamHandler()
7 | fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
8 | stream_handler.setFormatter(fmt)
9 | scheduler_logger.addHandler(stream_handler)  # log to the console
10 | scheduler_logger.setLevel(logging.INFO)
11 |
--------------------------------------------------------------------------------
/services/engine/aps/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | import rpyc
6 | from apscheduler.jobstores.base import JobLookupError
7 | from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
8 | from apscheduler.schedulers import SchedulerAlreadyRunningError, SchedulerNotRunningError
9 | from apscheduler.schedulers.background import BackgroundScheduler
10 |
11 | from aps.func import send_task_func
12 | from aps.logger import scheduler_logger
13 |
14 |
15 | ##################
16 | # APScheduler configuration
17 |
18 |
19 | class APSchedulerConfig(object):
20 |     # Timezone
21 |     timezone = 'Asia/Shanghai'
22 | 
23 |     # Job store backend
24 |     jobstores = {
25 |         'default': SQLAlchemyJobStore(url=os.getenv('DATABASE_URL'), tablename='apscheduler_jobs')
26 |     }
27 |
28 |     # Executors
29 |     executors = {
30 |         'default': {'type': 'threadpool', 'max_workers': 10}
31 |     }
32 | 
33 |     # Job defaults
34 |     job_defaults = {
35 |         'coalesce': True,  # merge runs when several run times are due at once
36 |         'misfire_grace_time': 3600,  # how many seconds a run may be delayed, so jobs that were due while the scheduler was down can still fire after a restart
37 |         'max_instances': 1  # maximum number of concurrently running instances
38 |     }
39 |
40 |
41 | ##################
42 | # APScheduler service
43 | class APSchedulerService(rpyc.Service):
44 | @staticmethod
45 | def start():
46 | try:
47 | apscheduler.start(paused=False)
48 | scheduler_logger.info('Started APScheduler Success!')
49 | except SchedulerAlreadyRunningError:
50 | scheduler_logger.info('APScheduler Already Running!')
51 |
52 | @staticmethod
53 | def shutdown():
54 | try:
55 | apscheduler.shutdown()
56 | except SchedulerNotRunningError:
57 | scheduler_logger.info('Scheduler has been shut down!')
58 |
59 | @staticmethod
60 | def exposed_add_task(task_id, schedule_type, schedule_data):
61 | """
62 | Add a scheduled task
63 | :param task_id:
64 | :param schedule_type:
65 | :param schedule_data:
66 | :return:
67 | """
68 | trigger_map = {'instantly': None, 'datetime': 'date'}
69 | apscheduler.add_job(
70 | func=send_task_func, id=str(task_id), kwargs={'task_id': task_id},
71 | trigger=trigger_map.get(schedule_type, schedule_type),
72 | **schedule_data
73 | )
74 |
75 | @staticmethod
76 | def exposed_delete_task(task_id, jobstore=None):
77 | """
78 | Delete a scheduled task
79 | :param task_id:
80 | :param jobstore:
81 | :return:
82 | """
83 | try:
84 | apscheduler.remove_job(job_id=str(task_id), jobstore=jobstore)
85 | except JobLookupError:
86 | scheduler_logger.warning('Job was not found or this job has ended!')
87 |
88 | @staticmethod
89 | def exposed_pause_task(task_id, jobstore=None):
90 | """
91 | Pause a scheduled task
92 | :param task_id:
93 | :param jobstore:
94 | :return:
95 | """
96 |
97 | try:
98 | apscheduler.pause_job(job_id=str(task_id), jobstore=jobstore)
99 | except JobLookupError:
100 | scheduler_logger.warning('Job was not found or this job has ended!')
101 |
102 | @staticmethod
103 | def exposed_resume_task(task_id, jobstore=None):
104 | """
105 | Resume a scheduled task
106 | :param task_id:
107 | :param jobstore:
108 | :return:
109 | """
110 |
111 | try:
112 | apscheduler.resume_job(job_id=str(task_id), jobstore=jobstore)
113 | except JobLookupError:
114 | scheduler_logger.warning('Job was not found or this job has ended!')
115 |
116 |
117 | ###### Create the APScheduler instance
118 | apscheduler = BackgroundScheduler(
119 | jobstores=APSchedulerConfig.jobstores, executors=APSchedulerConfig.executors,
120 | job_defaults=APSchedulerConfig.job_defaults, timezone=APSchedulerConfig.timezone)
121 |
122 | ###### Create the APScheduler service object for callers to use
123 | apscheduler_server = APSchedulerService()
124 |
--------------------------------------------------------------------------------
/services/engine/apserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from rpyc import ThreadedServer
5 |
6 | from aps.server import apscheduler_server, APSchedulerService
7 |
8 |
9 | def runserver():
10 |     """Run the APScheduler RPC service"""
11 | 
12 |     # Start APScheduler in the background
13 |     apscheduler_server.start()
14 | 
15 |     # Wrap APScheduler in an RPC server
16 |     server = ThreadedServer(
17 |         APSchedulerService, port=15003,
18 |         protocol_config={'allow_public_attrs': True, 'allow_pickle': True})
19 | 
20 |     # Start the RPC service
21 |     try:
22 |         server.start()
23 |     except (KeyboardInterrupt, SystemExit):
24 |         pass
25 |     finally:
26 |         apscheduler_server.shutdown()
27 |
28 |
29 | if __name__ == '__main__':
30 | runserver()
31 |
--------------------------------------------------------------------------------
/services/engine/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Remove cached __pycache__ directories
4 | find . -type d -name __pycache__ | xargs rm -rf
5 |
6 | # Compile the code with Cython
7 | python3 compile.py build_ext --inplace
8 | if [ $? -ne 0 ]; then
9 | exit 1
10 | fi
11 |
12 | # Rename the generated .so files (strip the cpython suffix)
13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
16 |
17 | # Delete the .py source files
18 | find ./rpc -name '*.py' | xargs rm -f
19 | find ./webs -name '*.py' | xargs rm -f
20 | find ./worker -name '*.py' | xargs rm -f
21 |
22 | # Remove files that are no longer needed
23 | rm -rf build
24 | rm -f .gitignore
25 | rm -f compile.py
26 | rm -f build.sh
27 |
--------------------------------------------------------------------------------
/services/engine/build_sentry_ini.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import os
5 |
6 | from configobj import ConfigObj
7 |
8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8')
9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False)
10 | log_ini.write()
11 |
--------------------------------------------------------------------------------
/services/engine/compile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Cython.Build import cythonize
4 | from Cython.Distutils import build_ext
5 | from setuptools import setup
6 | from setuptools.extension import Extension
7 |
8 | setup(
9 | ext_modules=cythonize(
10 | [
11 | Extension('rpc.*', ['rpc/*.py']),
12 | Extension('rpc.client.*', ['rpc/client/*.py']),
13 | Extension('rpc.pb.*', ['rpc/pb/*.py']),
14 | Extension('rpc.server.*', ['rpc/server/*.py']),
15 | Extension('webs.*', ['webs/*.py']),
16 | Extension('webs.api.*', ['webs/api/*.py']),
17 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']),
18 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']),
19 | Extension('webs.api.models*', ['webs/api/models/*.py']),
20 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']),
21 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']),
22 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']),
23 | Extension('webs.api.views.*', ['webs/api/views/*.py']),
24 | Extension('webs.core.*', ['webs/core/*.py']),
25 | Extension('webs.core.requests.*', ['webs/core/requests/*.py']),
26 | Extension('worker.*', ['worker/*.py']),
27 | Extension('worker.library.*', ['worker/library/*.py'])
28 | ],
29 | build_dir='build',
30 | compiler_directives=dict(
31 | always_allow_keywords=True, language_level=3
32 | )
33 | ),
34 | cmdclass=dict(
35 | build_ext=build_ext
36 | )
37 | )
38 |
--------------------------------------------------------------------------------
/services/engine/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # web service
4 | if [ "$ENDPOINT" = "web" ]; then
5 |     # development environment
6 |     if [ "$FLASK_ENV" = "development" ]; then
7 |         flask run -h 0.0.0.0 -p 5000
8 | 
9 |     # production environment
10 |     elif [ "$FLASK_ENV" = "production" ]; then
11 | 
12 |         # register sentry
13 |         python build_sentry_ini.py
14 | 
15 |         # serve the flask app with gunicorn
16 |         gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app
17 |     fi
18 | 
19 | # grpc server
20 | elif [ "$ENDPOINT" = "engine-grpc" ]; then
21 | python grpcserver.py
22 |
23 | # Apscheduler
24 | elif [ "$ENDPOINT" = "apscheduler" ]; then
25 | python apserver.py
26 |
27 | # worker
28 | elif [ "$ENDPOINT" = "engine-worker" ]; then
29 | # celery -A worker.celery_app worker -Q engine,base_result -l info -c 5 -n worker_engine@%h
30 | celery -A worker.celery_app worker -Q engine,base_result -l info --pool=prefork --concurrency=10 --prefetch-multiplier 4 --without-heartbeat -n worker_engine@%h
31 | fi
32 |
--------------------------------------------------------------------------------
/services/engine/grpcserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import time
5 | from concurrent import futures
6 |
7 | import grpc
8 |
9 | from rpc.pb import result_pb2_grpc
10 | from rpc.server.result import ResultServicer
11 |
12 |
13 | # Entry point
14 | def run():
15 |     # Run the rpc service on a thread pool
16 |     server = grpc.server(
17 |         futures.ThreadPoolExecutor(max_workers=int(os.getenv('GRPC_SERVER_MAX_WORKER_COUNT', 10))),
18 |         options=[
19 |             (
20 |                 'grpc.max_send_message_length',
21 |                 int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024
22 |             ),
23 |             (
24 |                 'grpc.max_receive_message_length',
25 |                 int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024
26 |             ),
27 |         ]
28 |     )
29 | 
30 |     ###### Register services
31 |     # Save basic crawl results
32 |     result_pb2_grpc.add_ResultServicer_to_server(ResultServicer(), server)
33 | 
34 |     # Set the listen address
35 |     server.add_insecure_port(address='0.0.0.0:15002')
36 | 
37 |     # Start the server
38 |     server.start()
39 | 
40 |     # Block so the rpc service keeps running
41 | try:
42 | while True:
43 | time.sleep(60 * 60 * 24)
44 | except KeyboardInterrupt:
45 | server.stop(0)
46 |
47 |
48 | if __name__ == '__main__':
49 | run()
50 |
--------------------------------------------------------------------------------
/services/engine/gunicorn_config.py:
--------------------------------------------------------------------------------
1 | # Sample Gunicorn configuration file.
2 |
3 | import multiprocessing as mlp
4 |
5 | # Fix infinite recursion by monkey patching early
6 | import os
7 |
8 | import gevent.monkey
9 |
10 | gevent.monkey.patch_all()
11 |
12 | #
13 | # Server socket
14 | #
15 | # bind - The socket to bind.
16 | #
17 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'.
18 | # An IP is a valid HOST.
19 | #
20 | # backlog - The number of pending connections. This refers
21 | # to the number of clients that can be waiting to be
22 | # served. Exceeding this number results in the client
23 | # getting an error when attempting to connect. It should
24 | # only affect servers under significant load.
25 | #
26 | # Must be a positive integer. Generally set in the 64-2048
27 | # range.
28 | #
29 |
30 | bind = '0.0.0.0:5000'
31 | backlog = 2048
32 |
33 | #
34 | # Worker processes
35 | #
36 | # workers - The number of worker processes that this server
37 | # should keep alive for handling requests.
38 | #
39 | # A positive integer generally in the 2-4 x $(NUM_CORES)
40 | # range. You'll want to vary this a bit to find the best
41 | # for your particular application's work load.
42 | #
43 | # worker_class - The type of workers to use. The default
44 | # sync class should handle most 'normal' types of work
45 | # loads. You'll want to read
46 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type
47 | # for information on when you might want to choose one
48 | # of the other worker classes.
49 | #
50 | # A string referring to a Python path to a subclass of
51 | # gunicorn.workers.base.Worker. The default provided values
52 | # can be seen at
53 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class
54 | #
55 | # worker_connections - For the eventlet and gevent worker classes
56 | # this limits the maximum number of simultaneous clients that
57 | # a single process can handle.
58 | #
59 | # A positive integer generally set to around 1000.
60 | #
61 | # timeout - If a worker does not notify the master process in this
62 | # number of seconds it is killed and a new worker is spawned
63 | # to replace it.
64 | #
65 | # Generally set to thirty seconds. Only set this noticeably
66 | # higher if you're sure of the repercussions for sync workers.
67 | # For the non sync workers it just means that the worker
68 | # process is still communicating and is not tied to the length
69 | # of time required to handle a single request.
70 | #
71 | # keepalive - The number of seconds to wait for the next request
72 | # on a Keep-Alive HTTP connection.
73 | #
74 | # A positive integer. Generally set in the 1-5 seconds range.
75 | #
76 |
77 | # Number of processes
78 | workers = mlp.cpu_count() * 2 + 1
79 |
80 | # Threads
81 | threads = mlp.cpu_count() * 2
82 |
83 | worker_class = 'gevent'
84 | worker_connections = 1000
85 | timeout = os.getenv('GUNICORN_TIMEOUT', 180)
86 | keepalive = 2
87 |
88 | #
89 | # spew - Install a trace function that spews every line of Python
90 | # that is executed when running the server. This is the
91 | # nuclear option.
92 | #
93 | # True or False
94 | #
95 |
96 | spew = False
97 |
98 | #
99 | # Server mechanics
100 | #
101 | # daemon - Detach the main Gunicorn process from the controlling
102 | # terminal with a standard fork/fork sequence.
103 | #
104 | # True or False
105 | #
106 | # raw_env - Pass environment variables to the execution environment.
107 | #
108 | # pidfile - The path to a pid file to write
109 | #
110 | # A path string or None to not write a pid file.
111 | #
112 | # user - Switch worker processes to run as this user.
113 | #
114 | # A valid user id (as an integer) or the name of a user that
115 | # can be retrieved with a call to pwd.getpwnam(value) or None
116 | # to not change the worker process user.
117 | #
118 | # group - Switch worker process to run as this group.
119 | #
120 | # A valid group id (as an integer) or the name of a user that
121 | # can be retrieved with a call to pwd.getgrnam(value) or None
122 | # to change the worker processes group.
123 | #
124 | # umask - A mask for file permissions written by Gunicorn. Note that
125 | # this affects unix socket permissions.
126 | #
127 | # A valid value for the os.umask(mode) call or a string
128 | # compatible with int(value, 0) (0 means Python guesses
129 | # the base, so values like "0", "0xFF", "0022" are valid
130 | # for decimal, hex, and octal representations)
131 | #
132 | # tmp_upload_dir - A directory to store temporary request data when
133 | # requests are read. This will most likely be disappearing soon.
134 | #
135 | # A path to a directory where the process owner can write. Or
136 | # None to signal that Python should choose one on its own.
137 | #
138 |
139 | #
140 | # Logging
141 | #
142 | # logfile - The path to a log file to write to.
143 | #
144 | # A path string. "-" means log to stdout.
145 | #
146 | # loglevel - The granularity of log output
147 | #
148 | # A string of "debug", "info", "warning", "error", "critical"
149 | #
150 |
151 | errorlog = '-'
152 | loglevel = 'error'
153 | accesslog = '-'
154 | access_log_format = '{"request_address": "%(h)s", ' \
155 | '"request_time": "%(t)s", ' \
156 | '"request": "%(r)s", ' \
157 | '"http_status_code": "%(s)s", ' \
158 | '"http_request_url": "%(U)s", ' \
159 | '"http_query_string": "%(q)s", ' \
160 | '"request_headers": {' \
161 | '"content-type": "%({content-type}i)s", ' \
162 | '"content-length": "%({content-length}i)s", ' \
163 | '"user-agent": "%(a)s"' \
164 | '}}'
165 |
--------------------------------------------------------------------------------
/services/engine/gunicorn_logging.ini:
--------------------------------------------------------------------------------
1 | # Logging configuration
2 |
3 | [loggers]
4 | keys = root, gunicorn.access, gunicorn.error
5 |
6 | [handlers]
7 | keys = access, error, sentry
8 |
9 | [formatters]
10 | keys = json, generic
11 |
12 | # Root logger
13 | # The root logger sends messages to the console and to Sentry.
14 | [logger_root]
15 | handlers = error, sentry
16 |
17 | # Gunicorn loggers
18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'.
19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation.
20 | # Only the critical logger has a handler to send messages to Sentry.
21 |
22 | [logger_gunicorn.access]
23 | level = INFO
24 | handlers = access
25 | propagate = 0
26 | qualname = gunicorn.access
27 |
28 | [logger_gunicorn.error]
29 | level = ERROR
30 | handlers = error, sentry
31 | propagate = 0
32 | qualname = gunicorn.error
33 |
34 | # Handlers
35 | [handler_access]
36 | class = StreamHandler
37 | formatter = json
38 | args = (sys.stdout, )
39 |
40 | [handler_error]
41 | class = StreamHandler
42 | formatter = json
43 | args = (sys.stderr,)
44 |
45 | [handler_sentry]
46 | class = raven.handlers.logging.SentryHandler
47 | level = ERROR
48 | formatter = generic
49 | sentry_dsn = example
50 | args = [%(sentry_dsn)s]
51 |
52 | [formatter_generic]
53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s
54 | [formatter_json]
55 | class = webs.api.utils.loggers.JSONFormatter
--------------------------------------------------------------------------------
/services/engine/manage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import click
4 | from flask.cli import FlaskGroup
5 | from webs import create_app
6 |
7 | app = create_app()
8 | cli = FlaskGroup(create_app=create_app)
9 |
10 |
11 | @cli.command('add_spider_server')
12 | @click.argument('address')
13 | def _add_spider_server(address):
14 | from webs.api.utils.helper import add_spider_server
15 | add_spider_server(address)
16 |
17 |
18 | if __name__ == '__main__':
19 | cli()
20 |
--------------------------------------------------------------------------------
/services/engine/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/services/engine/migrations/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # template used to generate migration files
5 | # file_template = %%(rev)s_%%(slug)s
6 |
7 | # set to 'true' to run the environment during
8 | # the 'revision' command, regardless of autogenerate
9 | # revision_environment = false
10 |
11 |
12 | # Logging configuration
13 | [loggers]
14 | keys = root,sqlalchemy,alembic
15 |
16 | [handlers]
17 | keys = console
18 |
19 | [formatters]
20 | keys = generic
21 |
22 | [logger_root]
23 | level = WARN
24 | handlers = console
25 | qualname =
26 |
27 | [logger_sqlalchemy]
28 | level = WARN
29 | handlers =
30 | qualname = sqlalchemy.engine
31 |
32 | [logger_alembic]
33 | level = INFO
34 | handlers =
35 | qualname = alembic
36 |
37 | [handler_console]
38 | class = StreamHandler
39 | args = (sys.stderr,)
40 | level = NOTSET
41 | formatter = generic
42 |
43 | [formatter_generic]
44 | format = %(levelname)-5.5s [%(name)s] %(message)s
45 | datefmt = %H:%M:%S
46 |
--------------------------------------------------------------------------------
/services/engine/migrations/env.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 |
3 | import logging
4 | from logging.config import fileConfig
5 |
6 | from sqlalchemy import engine_from_config
7 | from sqlalchemy import pool
8 |
9 | from alembic import context
10 |
11 | # this is the Alembic Config object, which provides
12 | # access to the values within the .ini file in use.
13 | config = context.config
14 |
15 | # Interpret the config file for Python logging.
16 | # This line sets up loggers basically.
17 | fileConfig(config.config_file_name)
18 | logger = logging.getLogger('alembic.env')
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | from flask import current_app
25 | config.set_main_option('sqlalchemy.url',
26 | current_app.config.get('SQLALCHEMY_DATABASE_URI'))
27 | target_metadata = current_app.extensions['migrate'].db.metadata
28 |
29 | # other values from the config, defined by the needs of env.py,
30 | # can be acquired:
31 | # my_important_option = config.get_main_option("my_important_option")
32 | # ... etc.
33 |
34 |
35 | def run_migrations_offline():
36 | """Run migrations in 'offline' mode.
37 |
38 | This configures the context with just a URL
39 | and not an Engine, though an Engine is acceptable
40 | here as well. By skipping the Engine creation
41 | we don't even need a DBAPI to be available.
42 |
43 | Calls to context.execute() here emit the given string to the
44 | script output.
45 |
46 | """
47 | url = config.get_main_option("sqlalchemy.url")
48 | context.configure(
49 | url=url, target_metadata=target_metadata, literal_binds=True
50 | )
51 |
52 | with context.begin_transaction():
53 | context.run_migrations()
54 |
55 |
56 | def run_migrations_online():
57 | """Run migrations in 'online' mode.
58 |
59 | In this scenario we need to create an Engine
60 | and associate a connection with the context.
61 |
62 | """
63 |
64 | # this callback is used to prevent an auto-migration from being generated
65 | # when there are no changes to the schema
66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
67 | def process_revision_directives(context, revision, directives):
68 | if getattr(config.cmd_opts, 'autogenerate', False):
69 | script = directives[0]
70 | if script.upgrade_ops.is_empty():
71 | directives[:] = []
72 | logger.info('No changes in schema detected.')
73 |
74 | connectable = engine_from_config(
75 | config.get_section(config.config_ini_section),
76 | prefix='sqlalchemy.',
77 | poolclass=pool.NullPool,
78 | )
79 |
80 | with connectable.connect() as connection:
81 | context.configure(
82 | connection=connection,
83 | target_metadata=target_metadata,
84 | process_revision_directives=process_revision_directives,
85 | **current_app.extensions['migrate'].configure_args
86 | )
87 |
88 | with context.begin_transaction():
89 | context.run_migrations()
90 |
91 |
92 | if context.is_offline_mode():
93 | run_migrations_offline()
94 | else:
95 | run_migrations_online()
96 |
--------------------------------------------------------------------------------
/services/engine/migrations/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 |
18 |
19 | def upgrade():
20 | ${upgrades if upgrades else "pass"}
21 |
22 |
23 | def downgrade():
24 | ${downgrades if downgrades else "pass"}
25 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/1569921cac58_加入响应时间和渲染时间.py:
--------------------------------------------------------------------------------
1 | """Add response time and render time
2 |
3 | Revision ID: 1569921cac58
4 | Revises: b3bd5bc9e4e3
5 | Create Date: 2021-04-07 17:11:13.336649
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 | # revision identifiers, used by Alembic.
12 | revision = '1569921cac58'
13 | down_revision = 'b3bd5bc9e4e3'
14 | branch_labels = None
15 | depends_on = None
16 |
17 |
18 | def upgrade():
19 | # ### commands auto generated by Alembic - please adjust! ###
20 | op.add_column('results', sa.Column('load_complete_time', sa.Integer(), nullable=True))
21 | op.add_column('results', sa.Column('response_time', sa.Integer(), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'response_time')
28 | op.drop_column('results', 'load_complete_time')
29 | # ### end Alembic commands ###
30 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/5b189e0161ee_加入网站编码.py:
--------------------------------------------------------------------------------
1 | """Add website charset
2 |
3 | Revision ID: 5b189e0161ee
4 | Revises: 1569921cac58
5 | Create Date: 2021-04-19 10:32:15.201074
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '5b189e0161ee'
14 | down_revision = '1569921cac58'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('charset', sa.String(length=256), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'charset')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/71bf761944f8_新增网站图标字段.py:
--------------------------------------------------------------------------------
1 | """Add website favicon fields
2 |
3 | Revision ID: 71bf761944f8
4 | Revises: 4a243739ef84
5 | Create Date: 2021-01-21 16:39:56.687514
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '71bf761944f8'
14 | down_revision = '4a243739ef84'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('favicon_link', sa.String(length=1024), nullable=True))
22 | op.add_column('results', sa.Column('favicon_md5', sa.String(length=50), nullable=True))
23 | # ### end Alembic commands ###
24 |
25 |
26 | def downgrade():
27 | # ### commands auto generated by Alembic - please adjust! ###
28 | op.drop_column('results', 'favicon_md5')
29 | op.drop_column('results', 'favicon_link')
30 | # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/services/engine/migrations/versions/b3bd5bc9e4e3_增加extra客户端额外数据.py:
--------------------------------------------------------------------------------
1 | """Add extra client-provided data
2 |
3 | Revision ID: b3bd5bc9e4e3
4 | Revises: 71bf761944f8
5 | Create Date: 2021-01-25 17:52:28.285830
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 |
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = 'b3bd5bc9e4e3'
14 | down_revision = '71bf761944f8'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('tasks', sa.Column('extra_data', sa.Text(), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('tasks', 'extra_data')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/engine/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.7.2
2 | alembic==1.4.3
3 | amqp==2.6.1
4 | APScheduler==3.6.3
5 | async-timeout==3.0.1
6 | attrs==20.3.0
7 | beautifulsoup4==4.9.3
8 | billiard==3.6.3.0
9 | celery==4.3.0
10 | certifi==2020.11.8
11 | chardet==3.0.4
12 | click==7.1.2
13 | configobj==5.0.6
14 | Cython==0.29.21
15 | Flask==1.1.2
16 | Flask-Migrate==2.4.0
17 | Flask-Redis==0.3.0
18 | Flask-SQLAlchemy==2.3.2
19 | gevent==1.4.0
20 | greenlet==0.4.15
21 | grpcio==1.33.2
22 | grpcio-tools==1.33.2
23 | gunicorn==19.9.0
24 | html2text==2020.1.16
25 | idna==2.8
26 | importlib-metadata==2.0.0
27 | itsdangerous==1.1.0
28 | Jinja2==2.11.2
29 | kombu==4.6.11
30 | Mako==1.1.3
31 | MarkupSafe==1.1.1
32 | marshmallow==2.19.2
33 | multidict==5.0.2
34 | Pillow==8.2.0
35 | plumbum==1.6.9
36 | protobuf==3.12.2 --no-binary protobuf
37 | psycopg2-binary==2.7.6.1
38 | python-dateutil==2.8.1
39 | python-editor==1.0.4
40 | pytz==2020.4
41 | raven==6.10.0
42 | redis==3.5.3
43 | requests==2.22.0
44 | rpyc==4.1.5
45 | six==1.15.0
46 | soupsieve==2.1
47 | SQLAlchemy==1.3.20
48 | typing-extensions==3.7.4.3
49 | tzlocal==2.1
50 | urllib3==1.25.11
51 | vine==1.3.0
52 | webargs==4.0.0
53 | Werkzeug==1.0.1
54 | yarl==1.6.3
55 | zipp==3.4.0
56 |
--------------------------------------------------------------------------------
/services/engine/rpc/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/client/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/client/callback_client.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import os
4 |
5 | import grpc
6 |
7 | from rpc.pb import callback_pb2
8 | from rpc.pb.callback_pb2_grpc import ResultStub
9 |
10 | CHUNK_SIZE = 10 * 1024
11 |
12 |
13 | class CallbackClient(object):
14 |
15 | def __init__(self, rpc_server):
16 | # Channel to the RPC server
17 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[
18 | ('grpc.max_send_message_length', int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024),
19 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024),
20 | ])
21 |
22 | # Result gRPC service stub
23 | self.stub = ResultStub(channel)
24 |
25 | def callback_save_result(self, task_obj, result):
26 | """
27 | Call back with the crawl result for a task
28 | :return:
29 | """
30 | result['extra_data'] = task_obj.extra_data
31 | self.stub.SaveResult(
32 | callback_pb2.SaveResultRequest(
33 | customer_id=task_obj.customer_id,
34 | task_id=task_obj.id,
35 | task_status='executing',
36 | finished=False,
37 | crawl_results=json.dumps(result, ensure_ascii=False)
38 | ),
39 | timeout=30
40 | )
41 |
42 | def callback_task_finished(self, customer_id, task_id):
43 | """Call back that the task has finished"""
44 |
45 | self.stub.SaveResult(
46 | callback_pb2.SaveResultRequest(
47 | customer_id=customer_id, task_id=task_id,
48 | finished=True, task_status='finished'),
49 | timeout=30
50 | )
51 |
--------------------------------------------------------------------------------
/services/engine/rpc/codegen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Declare the proto source path and the output path for generated pb files
4 | declare -a proto_path=("protos")
5 | declare -a python_out=("pb")
6 |
7 | # Generate the pb files
8 | python -m grpc_tools.protoc \
9 | --proto_path=$proto_path/ \
10 | --python_out=$python_out \
11 | --grpc_python_out=$python_out \
12 | $proto_path/*.proto
13 |
14 | # Rewrite the broken import statements in the generated pb files
15 | sed -i '' -E 's/^import (.*pb2)/from . import \1/g' ${python_out}/*pb2*.py
--------------------------------------------------------------------------------
/services/engine/rpc/pb/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/pb/callback_pb2_grpc.py:
--------------------------------------------------------------------------------
1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2 | """Client and server classes corresponding to protobuf-defined services."""
3 | import grpc
4 |
5 | from . import callback_pb2 as result__pb2
6 |
7 |
8 | class ResultStub(object):
9 | """Service definition
10 | """
11 |
12 | def __init__(self, channel):
13 | """Constructor.
14 |
15 | Args:
16 | channel: A grpc.Channel.
17 | """
18 | self.SaveResult = channel.unary_unary(
19 | '/result.Result/SaveResult',
20 | request_serializer=result__pb2.SaveResultRequest.SerializeToString,
21 | response_deserializer=result__pb2.SaveResultResponse.FromString,
22 | )
23 |
24 |
25 | class ResultServicer(object):
26 | """Service definition
27 | """
28 |
29 | def SaveResult(self, request, context):
30 | """Save basic crawl results
31 | """
32 | context.set_code(grpc.StatusCode.UNIMPLEMENTED)
33 | context.set_details('Method not implemented!')
34 | raise NotImplementedError('Method not implemented!')
35 |
36 |
37 | def add_ResultServicer_to_server(servicer, server):
38 | rpc_method_handlers = {
39 | 'SaveResult': grpc.unary_unary_rpc_method_handler(
40 | servicer.SaveResult,
41 | request_deserializer=result__pb2.SaveResultRequest.FromString,
42 | response_serializer=result__pb2.SaveResultResponse.SerializeToString,
43 | ),
44 | }
45 | generic_handler = grpc.method_handlers_generic_handler(
46 | 'result.Result', rpc_method_handlers)
47 | server.add_generic_rpc_handlers((generic_handler,))
48 |
49 |
50 | # This class is part of an EXPERIMENTAL API.
51 | class Result(object):
52 | """Service definition
53 | """
54 |
55 | @staticmethod
56 | def SaveResult(request,
57 | target,
58 | options=(),
59 | channel_credentials=None,
60 | call_credentials=None,
61 | insecure=False,
62 | compression=None,
63 | wait_for_ready=None,
64 | timeout=None,
65 | metadata=None):
66 | return grpc.experimental.unary_unary(request, target, '/result.Result/SaveResult',
67 | result__pb2.SaveResultRequest.SerializeToString,
68 | result__pb2.SaveResultResponse.FromString,
69 | options, channel_credentials,
70 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
71 |
--------------------------------------------------------------------------------
/services/engine/rpc/protos/result.proto:
--------------------------------------------------------------------------------
1 | // Use the proto3 syntax
2 | syntax = "proto3";
3 | 
4 | // Current package
5 | package result;
6 | 
7 | 
8 | // Service definition
9 | service Result {
10 |     // Save basic crawl results
11 |     rpc SaveBaseResult(SaveBaseResultRequest) returns (SaveBaseResultResponse);
12 |     // Upload screenshots as a stream
13 |     rpc StreamUploadPicture(stream StreamUploadPictureRequest) returns (StreamUploadPictureResponse);
14 |     // Upload HAR files as a stream
15 |     rpc StreamUploadHarFile(stream StreamUploadHarFileRequest) returns (StreamUploadHarFileResponse);
16 |     // Mark the crawl status of a subtask
17 |     rpc SetSubTaskStatus(SetSubTaskStatusRequest) returns (SetSubTaskStatusResponse);
18 | }
19 |
20 |
21 | // Request message
22 | message SaveBaseResultRequest {
23 | int32 subtask_id = 1;
24 | int32 url_id = 2;
25 | string url_address = 3;
26 | int32 http_code = 4;
27 | string title = 5;
28 | string content = 6;
29 | string current_url = 7;
30 | string screenshot_id = 8;
31 | string response_headers = 9;
32 | string finished_at = 10;
33 | string har_uuid = 11;
34 | repeated RedirectChain redirect_chain = 12;
35 | repeated Cookies cookies = 13;
36 | int32 response_time = 14;
37 | int32 load_complete_time = 15;
38 | }
39 |
40 |
41 | // Redirect chain entries
42 | message RedirectChain {
43 | string redirect_url = 1;
44 | int32 redirect_http_code = 2;
45 | }
46 |
47 |
48 | // Cookies
49 | message Cookies {
50 | string name = 1;
51 | string path = 2;
52 | string value = 3;
53 | string domain = 4;
54 | bool secure = 5;
55 | int64 expires = 6;
56 | bool httpOnly = 7;
57 | string sameSite = 8;
58 | }
59 |
60 |
61 | // Response
62 | message SaveBaseResultResponse{
63 | bool status = 1;
64 | }
65 |
66 | // Screenshot stream
67 | message StreamUploadPictureRequest {
68 | message FileData {
69 | string filename = 1;
70 | bytes buffer = 2;
71 | }
72 | oneof payload {
73 | string filename = 1;
74 | FileData file_data = 2;
75 | }
76 | }
77 |
78 | // Received screenshot length
79 | message StreamUploadPictureResponse {
80 | int32 length = 1;
81 | }
82 |
83 | // Subtask id and status
84 | message SetSubTaskStatusRequest {
85 | int32 subtask_id = 1;
86 | bool status = 2;
87 | string finished_at = 3;
88 | }
89 |
90 | // Response for setting the subtask status
91 | message SetSubTaskStatusResponse {
92 | bool set_success = 1;
93 | }
94 |
95 | // HAR file stream
96 | message StreamUploadHarFileRequest {
97 | message FileData {
98 | string filename = 1;
99 | bytes buffer = 2;
100 | }
101 | oneof payload {
102 | string filename = 1;
103 | FileData file_data = 2;
104 | }
105 | }
106 |
107 | // Received HAR file length
108 | message StreamUploadHarFileResponse {
109 | int32 length = 1;
110 | }
--------------------------------------------------------------------------------
/services/engine/rpc/server/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/rpc/server/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from google.protobuf.json_format import MessageToDict
6 |
7 | from manage import app
8 | from rpc.pb import result_pb2, result_pb2_grpc
9 | from webs.api.models.db_proxy import subtask_model_proxy, task_model_proxy, schedule_task_proxy, apscheduler_model_proxy
10 | from worker import celery_app
11 |
12 |
13 | def save_chunks_to_file(request_streams, folder_path):
14 | filename, file_chunks = None, []
15 | for yield_obj in request_streams:
16 | if getattr(yield_obj, 'filename'):
17 | filename = yield_obj.filename
18 | else:
19 | file_chunks.append(yield_obj.file_data)
20 | with open(f'/usr/src/app/{folder_path}/{filename}', 'wb') as f:
21 | for chunk in file_chunks:
22 | f.write(chunk.buffer)
23 | return os.path.getsize(f'/usr/src/app/{folder_path}/{filename}')
24 |
25 |
26 | class ResultServicer(result_pb2_grpc.ResultServicer):
27 |     # Create the screenshots directory if it does not exist
28 | if not os.path.exists('/usr/src/app/screenshots'):
29 | os.mkdir('/usr/src/app/screenshots')
30 |
31 |     # Implements the SaveBaseResult rpc
32 | def SaveBaseResult(self, request, context):
33 |         # Convert the proto message body into a dict
34 | request_dict = MessageToDict(request, preserving_proto_field_name=True)
35 |
36 |         # Process the crawl data asynchronously via Celery
37 | celery_app.send_task('save_base_result', queue='base_result', kwargs=request_dict)
38 |
39 | return result_pb2.SaveBaseResultResponse(status=True)
40 |
41 |     # Implements the StreamUploadPicture rpc for streaming image uploads
42 | def StreamUploadPicture(self, request_iterator, context):
43 | try:
44 | file_length = save_chunks_to_file(request_iterator, folder_path='screenshots')
45 | except FileNotFoundError:
46 | file_length = 0
47 | return result_pb2.StreamUploadPictureResponse(length=file_length)
48 |
49 |     # Implements the SetSubTaskStatus rpc to mark a subtask's processing status
50 | def SetSubTaskStatus(self, request, context):
51 |         # Update the subtask status inside a Flask app context
52 | with app.app_context():
53 |             # Set the subtask status
54 | subtask_obj = subtask_model_proxy.set_many_attr(
55 | obj_id=request.subtask_id,
56 | fields_v={'finished': request.status, 'finished_at': request.finished_at}
57 | )
58 |             # Mark the schedule task finished once all its subtasks are done
59 | unfinished_count = subtask_model_proxy.query_unfinished_subtask_count(subtask_obj.schedule_task_id)
60 | if unfinished_count == 0:
61 | schedule_task_obj = schedule_task_proxy.query_schedule_task_obj_by_subtask_id(subtask_obj.id)
62 | schedule_task_proxy.set_many_attr(
63 | obj=schedule_task_obj, fields_v={'schedule_task_status': 'finished', 'finished': True}
64 | )
65 |
66 |                 # Look up the main task
67 | task_id, running_schedule_tasks = schedule_task_proxy.query_running_task_and_task_id(
68 | subtask_obj.schedule_task_id)
69 | task_obj = task_model_proxy.find(id=task_id)
70 |
71 |                 # Notify the caller that the current schedule task has finished
72 | if schedule_task_obj.crawl_options.get('callback_type'):
73 | from rpc.client.callback_client import CallbackClient
74 | try:
75 | callback_client = CallbackClient(rpc_server=task_obj.crawl_options.get('callback_address'))
76 | callback_client.callback_task_finished(customer_id=task_obj.customer_id, task_id=task_id)
77 | except Exception as e:
78 | print(e)
79 |                         print(f"Task-finished callback failed: ID-{task_id}")
80 |
81 |                 # Mark the main task as finished when no schedule tasks remain and no further runs are planned
82 | next_run_time = apscheduler_model_proxy.get_next_run_time(apschedule_id=task_id)
83 | if not running_schedule_tasks and not next_run_time:
84 | task_model_proxy.set_many_attr(
85 | obj=task_obj, fields_v={'task_status': 'finished', 'finished': True}
86 | )
87 |
88 | return result_pb2.SetSubTaskStatusResponse(set_success=True)
89 |
90 |     # Implements the StreamUploadHarFile rpc for streaming HAR file uploads
91 | def StreamUploadHarFile(self, request_iterator, context):
92 | try:
93 | file_length = save_chunks_to_file(request_iterator, folder_path='hars')
94 | except FileNotFoundError:
95 | file_length = 0
96 |         return result_pb2.StreamUploadHarFileResponse(length=file_length)
97 |
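The servicer above is wired into a gRPC server by grpcserver.py (not shown in this section); a minimal, hypothetical bootstrap would look roughly like the sketch below. The bind address and worker count are assumptions.

from concurrent import futures

import grpc

from rpc.pb import result_pb2_grpc
from rpc.server.result import ResultServicer


def serve(bind_address='[::]:50051'):
    # Thread-pool size and bind address are illustrative only.
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    result_pb2_grpc.add_ResultServicer_to_server(ResultServicer(), server)
    server.add_insecure_port(bind_address)
    server.start()
    server.wait_for_termination()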
--------------------------------------------------------------------------------
/services/engine/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/engine/wappalyzer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from wappalyzer.wappalyzerhandler import WappalyzerHandler
4 |
5 | wappalyzer_handler = WappalyzerHandler(techno_path='wappalyzer/data.json')
6 |
--------------------------------------------------------------------------------
/services/engine/wappalyzer/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from typing import Any
6 | import re
7 |
8 | from wappalyzer.modelcalss import Pattern
9 |
10 |
11 | def _transform_patterns(
12 | patterns: Any,
13 | case_sensitive: bool = False
14 | ):
15 | """Canonicalize the patterns of different sections.
16 | """
17 |
18 | def to_list(value):
19 | return value if type(value) is list else [value]
20 |
21 | if not patterns:
22 | return []
23 |
24 | if type(patterns) is str or type(patterns) is list:
25 | patterns = {
26 | "main": patterns
27 | }
28 |
29 | parsed = {}
30 | for key in patterns:
31 | name = key if case_sensitive else key.lower()
32 | parsed[name] = [
33 | _parse_pattern(ptrn, key)
34 | for ptrn in to_list(patterns[key])
35 | ]
36 |
37 | return parsed["main"] if "main" in parsed else parsed
38 |
39 |
40 | def _parse_pattern(pattern: str, key: str = ""):
41 |     """Parse a regex pattern string and create a Pattern object.
42 |     It extracts the regex, the version and the confidence values from
43 |     the given string.
44 | """
45 | parts = pattern.split("\\;")
46 |
47 | value = parts[0]
48 |
49 |     # It seems that in js "[^]" behaves like "."; however, Python's
50 |     # re module interprets it differently (which leads to an error),
51 |     # so it is better to substitute it.
52 | regex = value.replace("/", "\\/").replace("[^]", ".")
53 |
54 | attrs = {
55 | "value": value,
56 | "regex": re.compile(regex, re.I)
57 | }
58 | for attr in parts[1:]:
59 | attr = attr.split(":")
60 | if len(attr) > 1:
61 | attrs[attr[0]] = ":".join(attr[1:])
62 |
63 | return Pattern(
64 | value=attrs["value"],
65 | regex=attrs["regex"],
66 | confidence=int(attrs.get("confidence", 100)),
67 | version=attrs.get("version", ""),
68 | key=key,
69 | )
70 |
71 |
72 | def extract_scripts(html: str):
73 | soup = BeautifulSoup(html, "html.parser")
74 | script_tags = soup.findAll("script")
75 |
76 | scripts = []
77 | for script_tag in script_tags:
78 | try:
79 | src = script_tag.attrs["src"]
80 | if not src.startswith("data:text/javascript;"):
81 | scripts.append(src)
82 | except KeyError:
83 | pass
84 |
85 | return scripts
86 |
87 |
88 | def extract_metas(html: str):
89 | soup = BeautifulSoup(html, "html.parser")
90 | meta_tags = soup.findAll("meta")
91 |
92 | metas = {}
93 | for meta_tag in meta_tags:
94 | try:
95 | key = meta_tag.attrs.get("name", None) \
96 | or meta_tag.attrs["property"]
97 | metas[key.lower()] = [meta_tag.attrs["content"]]
98 | except KeyError:
99 | continue
100 |
101 | return metas
102 |
103 |
104 | def extract_cookies(cookies_list):
105 | cookies_dict = {}
106 | for each_cookie in cookies_list:
107 | cookies_dict.update({each_cookie['name']: each_cookie['value']})
108 | return cookies_dict
109 |
110 |
111 | def extract_headers(headers):
112 | return {
113 | k.lower(): [v]
114 | for k, v in headers.items()
115 | }
116 |
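As a quick illustration of the helpers above, a Wappalyzer-style pattern string splits on the literal `\;` separator into regex, version and confidence parts. The pattern below is made up, and the import assumes the wappalyzer package (and its data.json) resolves from the project root.

from wappalyzer.helper import _parse_pattern

p = _parse_pattern(r"jquery[.-]([\d.]+)\.js\;version:\1\;confidence:50", key="script")
# p.value      -> the raw regex part of the pattern
# p.version    -> '\1', a back-reference into the captured version group
# p.confidence -> 50
assert p.regex.search("/static/jquery-3.6.0.js")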
--------------------------------------------------------------------------------
/services/engine/wappalyzer/modelcalss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class Technology:
4 | def __init__(
5 | self,
6 | name: str,
7 | categories,
8 | url,
9 | headers,
10 | cookies,
11 | html,
12 | meta,
13 | scripts,
14 | js,
15 | implies,
16 | excludes,
17 | icon: str,
18 | website: str,
19 | cpe: str,
20 | ):
21 | self.name = name
22 | self.categories = categories
23 | self.url = url
24 | self.headers = headers
25 | self.cookies = cookies
26 | self.html = html
27 | self.meta = meta
28 | self.scripts = scripts
29 | self.js = js
30 | self.implies = implies
31 | self.excludes = excludes
32 | self.icon = icon
33 | self.website = website
34 | self.cpe = cpe
35 |
36 | def __getitem__(self, k):
37 | return self.__dict__[k]
38 |
39 | def get(self, *args, **kwargs):
40 | return self.__dict__.get(*args, **kwargs)
41 |
42 | def __repr__(self):
43 | return repr(self.__dict__)
44 |
45 |
46 | class Category:
47 | def __init__(self, id: str, name: str, priority: int):
48 | self.id = id
49 | self.name = name
50 | self.priority = priority
51 |
52 |
53 | class Pattern:
54 |
55 | def __init__(
56 | self,
57 | value: str,
58 | regex,
59 | confidence: int,
60 | version: str,
61 | key: str
62 | ):
63 | self.value = value
64 | self.regex = regex
65 | self.confidence = confidence
66 | self.version = version
67 | self.key = key
68 |
69 | def __getitem__(self, k):
70 | return self.__dict__[k]
71 |
72 | def __repr__(self):
73 | return repr(self.__dict__)
74 |
75 |
76 | class Imply:
77 | """Structure to define a technology that is implied by the use of another
78 | one.
79 |
80 | Attributes:
81 | name (str): Name of the implied technology.
82 | confidence (int): Confidence of the implied technology.
83 |
84 | """
85 |
86 | def __init__(self, name: str, confidence: int):
87 | self.name = name
88 | self.confidence = confidence
89 |
90 |
91 | class Exclude:
92 | """Structure to define a technology that is incompatible with another
93 | one.
94 |
95 | Attributes:
96 | name (str): Name of the excluded technology.
97 |
98 | """
99 |
100 | def __init__(self, name: str):
101 | self.name = name
102 |
103 |
104 | class PatternMatch:
105 | """Identifies a match in a technology pattern.
106 |
107 | Attributes:
108 | technology (Technology): Technology identified by the pattern.
109 |         pattern (Pattern): Pattern that caused the match.
110 | version (str): Version identified by the pattern in the match.
111 | """
112 |
113 | def __init__(self, technology: Technology, pattern: Pattern, version: str):
114 | self.technology = technology
115 | self.pattern = pattern
116 | self.version = version
117 |
118 | def __getitem__(self, k):
119 | return self.__dict__[k]
120 |
121 | def __repr__(self):
122 | return repr(self.__dict__)
123 |
124 | def __eq__(self, o):
125 | return (
126 | self.technology.name == o.technology.name
127 |             and self.pattern.key == o.pattern.key
128 |             and self.pattern.value == o.pattern.value
129 | )
130 |
131 | def __hash__(self):
132 | return hash(
133 | (self.technology.name, self.pattern.key, self.pattern.value)
134 | )
135 |
136 |
137 | class TechMatch:
138 | """Identifies a match in a technology.
139 |
140 | Attributes:
141 | technology (Technology): Technology identified.
142 |         confidence (int): Confidence in the match, derived from all the
143 | patterns of this technology that matched.
144 | version (str): Version identified by the patterns.
145 | """
146 |
147 | def __init__(self, technology: Technology, confidence: int, version: str):
148 | self.technology = technology
149 | self.confidence = confidence
150 | self.version = version
151 |
152 | def __getitem__(self, k):
153 | return self.__dict__[k]
154 |
155 | def __repr__(self):
156 | return repr(self.__dict__)
157 |
158 | def __eq__(self, o):
159 | return self.technology.name == o.technology.name
160 |
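
A small sketch (illustrative values only) of why PatternMatch defines __eq__ and __hash__: duplicate hits on the same technology and pattern collapse when collected into a set.

from wappalyzer.modelcalss import Pattern, PatternMatch, Technology

tech = Technology(name="jQuery", categories=[], url=[], headers={}, cookies={},
                  html=[], meta={}, scripts=[], js={}, implies=[], excludes=[],
                  icon="", website="https://jquery.com", cpe="")
pattern = Pattern(value="jquery", regex=None, confidence=100, version="", key="script")

hits = {PatternMatch(tech, pattern, "3.6.0"), PatternMatch(tech, pattern, "3.6.0")}
assert len(hits) == 1  # same technology name + pattern key/value -> one match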
--------------------------------------------------------------------------------
/services/engine/webs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from flask import Flask
6 |
7 | from webs.api.utils.requests import before_request_middleware, \
8 | after_request_middleware, teardown_appcontext_middleware
9 | from webs.api.utils.responses import JSONResponse, app_error_handler
10 | from webs.api.utils.routers import register_routes as init_routes
11 | from webs.api.utils.settings import init_db
12 |
13 |
14 | def create_app():
15 | # instantiate the app
16 | app = Flask(__name__)
17 |
18 | # set config
19 | app_settings = os.getenv('APP_SETTINGS')
20 | app.config.from_object(app_settings)
21 |
22 | # register all blueprints
23 | init_routes(app=app)
24 |
25 | # register custom response class
26 | app.response_class = JSONResponse
27 |
28 | # register custom error handler
29 | app_error_handler(app=app)
30 |
31 | # register before request middleware
32 | before_request_middleware(app=app)
33 |
34 | # register after request middleware
35 | after_request_middleware(app=app)
36 |
37 | # register after app context teardown middleware
38 | teardown_appcontext_middleware(app=app)
39 |
40 | # set up extensions
41 | app_db = init_db(app=app)
42 |
43 | # shell context for flask cli
44 | @app.shell_context_processor
45 | def ctx():
46 | return {'app': app, 'db': app_db}
47 |
48 | return app
49 |
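The factory is normally driven by manage.py / gunicorn; for a quick local run it could be used roughly as below. The config class path is an assumption (the real names live in webs/config.py).

import os

from webs import create_app

# APP_SETTINGS must point at a config object; the class name here is assumed.
os.environ.setdefault('APP_SETTINGS', 'webs.config.DevelopmentConfig')
app = create_app()

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)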
--------------------------------------------------------------------------------
/services/engine/webs/api/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/bizs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/bizs/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import base64
4 | import os
5 | from io import BytesIO
6 |
7 | from PIL import Image
8 | from flask import make_response, send_file
9 |
10 | from webs.api.exceptions.customs import RecordNotFound, InvalidAPIRequest
11 | from webs.api.models.db_proxy import result_model_proxy
12 |
13 |
14 | class ResultBiz(object):
15 |
16 | def result_by_url(self, url, fields):
17 | """
18 |         Query the crawl result for a given url
19 | :param url:
20 | :param fields:
21 | :return:
22 | """
23 |
24 |         # Resolve the url to its id and return the latest result
25 |
26 | return result_model_proxy.get_by_url(url, fields)
27 |
28 | def result_by_id(self, result_id):
29 | """
30 |         Query the result by result id
31 | :param result_id:
32 | :return:
33 | """
34 | return result_model_proxy.get_by_result_id(result_id)
35 |
36 | def get_screenshot(self, screenshot_id, download=False):
37 | """
38 |         Fetch a screenshot
39 | :param screenshot_id:
40 | :param download:
41 | :return:
42 | """
43 |
44 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
45 | if not os.path.exists(screenshot_path):
46 | raise RecordNotFound('截图不存在!')
47 | response = make_response(send_file(
48 | filename_or_fp=screenshot_path,
49 | as_attachment=download
50 | ))
51 | response.direct_passthrough = False
52 | return response
53 |
54 | def get_screenshot_base64_encode(self, screenshot_id):
55 | """
56 |         Return the screenshot as a Base64-encoded string
57 | :param screenshot_id:
58 | :return:
59 | """
60 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
61 | if not os.path.exists(screenshot_path):
62 | raise RecordNotFound('截图不存在!')
63 | with open(screenshot_path, 'rb') as f:
64 | img_encode_str = base64.b64encode(f.read()).decode('utf-8')
65 | return img_encode_str
66 |
67 | def download_har(self, har_uuid):
68 | """
69 |         Download the HAR file
70 | :param har_uuid:
71 | :return:
72 | """
73 | har_path = f'/usr/src/app/hars/{har_uuid}.json'
74 | if not os.path.exists(har_path):
75 | raise RecordNotFound('该文件不存在!')
76 | response = make_response(send_file(
77 | filename_or_fp=har_path,
78 | as_attachment=True
79 | ))
80 | response.direct_passthrough = False
81 | return response
82 |
83 | def get_favicon(self, favicon_md5, download=False):
84 | """
85 |         Fetch the favicon
86 | :param favicon_md5:
87 | :param download:
88 | :return:
89 | """
90 |
91 | newest_record = result_model_proxy.find(favicon_md5=favicon_md5)
92 | if not newest_record:
93 | raise RecordNotFound('图标不存在!')
94 | _, ext = os.path.splitext(newest_record.favicon_link)
95 | favicon_path = f'/usr/src/app/screenshots/{favicon_md5}{ext}'
96 | if not os.path.exists(favicon_path):
97 | raise RecordNotFound('图标不存在!')
98 | response = make_response(send_file(
99 | filename_or_fp=favicon_path,
100 | as_attachment=download
101 | ))
102 | response.direct_passthrough = False
103 | return response
104 |
105 | def get_small_screenshot(self, screenshot_id, wide, high):
106 |         """Return a thumbnail of the screenshot"""
107 |
108 | screenshot_path = f'/usr/src/app/screenshots/{screenshot_id}.png'
109 | if not os.path.exists(screenshot_path):
110 | raise RecordNotFound('截图不存在!')
111 | im = Image.open(f'/usr/src/app/screenshots/{screenshot_id}.png')
112 |
113 | src_wide, src_high = im.size
114 | ratio = src_wide / wide
115 | im = im.resize((wide, int(src_high / ratio)), Image.ANTIALIAS)
116 | im = im.crop((0, 0, wide, high))
117 |
118 |         # Write the thumbnail into an in-memory buffer
119 | byte_io = BytesIO()
120 | im.save(byte_io, 'PNG')
121 | byte_io.seek(0)
122 |
123 | response = make_response(send_file(
124 | filename_or_fp=byte_io,
125 | as_attachment=False,
126 | mimetype='image/png'
127 | # attachment_filename=f'{screenshot_id}.png'
128 | ))
129 | response.direct_passthrough = False
130 | return response
131 |
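The thumbnail rule in get_small_screenshot is: scale to the requested width while keeping the aspect ratio, then crop the top-left wide x high window. A standalone sketch of the same math (function name is made up):

from PIL import Image


def make_thumbnail(src_path, wide=272, high=165):
    im = Image.open(src_path)
    src_wide, src_high = im.size
    ratio = src_wide / wide  # horizontal scale factor
    # Image.ANTIALIAS matches the code above; Pillow >= 10 renamed it to Image.LANCZOS.
    im = im.resize((wide, int(src_high / ratio)), Image.ANTIALIAS)
    return im.crop((0, 0, wide, high))  # keep only the top-left window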
--------------------------------------------------------------------------------
/services/engine/webs/api/exceptions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/exceptions/customs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from werkzeug.exceptions import BadRequest, \
5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict
6 |
7 |
8 | class CustomException(Exception):
9 | """Custom JSON based exception."""
10 |
11 | status_code = BadRequest.code
12 | message = ""
13 |
14 | def __init__(self, message=None, status_code=None):
15 | """
16 | :param status_code: response status_code
17 | :param message: exception message
18 | """
19 |
20 | Exception.__init__(self)
21 |
22 | if message is not None:
23 | self.message = message
24 | if status_code is not None:
25 | self.status_code = status_code
26 |
27 | def to_dict(self):
28 | return {
29 | "status": False,
30 | "error": {
31 | "message": self.message,
32 | "type": str(self.__class__.__name__)
33 | }
34 | }
35 |
36 |
37 | class InvalidContentType(CustomException):
38 | """
39 | Raised when an invalid Content-Type is provided.
40 | """
41 |
42 | status_code = BadRequest.code
43 |
44 |
45 | class UnauthorizedAPIRequest(CustomException):
46 | """
47 | Raise if the user is not authorized. Also used if you want to use HTTP
48 | basic auth.
49 | """
50 |
51 | status_code = Unauthorized.code
52 |
53 |
54 | class InvalidPermissions(CustomException):
55 | """
56 | Raise if the user doesn't have the permission for the requested resource
57 | but was authenticated.
58 | """
59 |
60 | status_code = Forbidden.code
61 |
62 |
63 | class InvalidAPIRequest(CustomException):
64 | """
65 | Raised when an invalid request has been made.
66 |     (e.g. accessed a nonexistent url, or the schema validation did
67 | not pass)
68 | """
69 |
70 | status_code = BadRequest.code
71 |
72 |
73 | class ServerError(CustomException):
74 | """
75 | Generic internal error.
76 | Inherit this error for all subsequent
77 |     errors that are related to the server.
78 | """
79 |
80 | status_code = InternalServerError.code
81 |
82 |
83 | class DatabaseError(CustomException):
84 | """
85 | Generic database interaction error.
86 | Inherit this error for all subsequent
87 | errors that are related to database.
88 | """
89 |
90 | status_code = InternalServerError.code
91 |
92 |
93 | class RecordNotFound(DatabaseError):
94 | """
95 | Raised when the record was not found in the database.
96 | """
97 |
98 | status_code = NotFound.code
99 |
100 |
101 | class RecordAlreadyExists(DatabaseError):
102 | """
103 | Raised in the case of violation of a unique constraint.
104 | """
105 |
106 | status_code = Conflict.code
107 |
108 |
109 | class PublishError(CustomException):
110 | """
111 |     Raised when a publish operation fails.
112 | """
113 |
114 | status_code = InternalServerError.code
115 |
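For context, these exceptions are turned into JSON responses by app_error_handler in webs/api/utils/responses.py (not shown in this section); a minimal handler for this hierarchy could look roughly like the sketch below.

from flask import jsonify

from webs.api.exceptions.customs import CustomException


def register_error_handler(app):
    @app.errorhandler(CustomException)
    def handle_custom_exception(error):
        # to_dict() yields {"status": False, "error": {"message": ..., "type": ...}}
        return jsonify(error.to_dict()), error.status_code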
--------------------------------------------------------------------------------
/services/engine/webs/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .base_model import db, redis_store
5 | from .task import Task, SubTask, ScheduleTaskRecord
6 | from .url import Url
7 | from .task_url import TaskUrl
8 | from .server import Server
9 | from .result import Result
10 | from .apscheduler_job import APSchedulerJobs
11 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/apscheduler_job.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from sqlalchemy import Column, types
5 | from sqlalchemy.dialects import postgresql
6 |
7 | from webs.api.models import db
8 |
9 | """
10 | APScheduler job store table
11 | """
12 |
13 |
14 | class APSchedulerJobs(db.Model):
15 | __tablename__ = 'apscheduler_jobs'
16 |
17 | id = Column(types.String(length=191), primary_key=True)
18 | next_run_time = Column(postgresql.DOUBLE_PRECISION(precision=53), index=True)
19 | job_state = Column(postgresql.BYTEA(), nullable=False)
20 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/base_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 | from flask_redis import FlaskRedis
5 |
6 | db = SQLAlchemy()
7 | redis_store = FlaskRedis()
8 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .apschedule import ApschedulerModelProxy
3 | from .result import ResultModelProxy
4 | from .schedule_task import ScheduleTaskProxy
5 | from .server import ServerModelProxy
6 | from .task import TaskModelProxy
7 | from .task_url import TaskUrlModelProxy
8 | from .url import UrlModelProxy
9 | from .subtask import SubTaskModelProxy
10 |
11 | task_model_proxy = TaskModelProxy()
12 | schedule_task_proxy = ScheduleTaskProxy()
13 | url_model_proxy = UrlModelProxy()
14 | task_url_model_proxy = TaskUrlModelProxy()
15 | server_model_proxy = ServerModelProxy()
16 | subtask_model_proxy = SubTaskModelProxy()
17 | result_model_proxy = ResultModelProxy()
18 | apscheduler_model_proxy = ApschedulerModelProxy()
19 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/apschedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | from datetime import datetime
4 |
5 | from webs.api.models import APSchedulerJobs
6 | from webs.api.models.db_proxy.base import BaseModelProxy
7 |
8 |
9 | class ApschedulerModelProxy(BaseModelProxy):
10 | def __init__(self):
11 | super().__init__()
12 | self.model = APSchedulerJobs
13 |
14 | def get_next_run_time(self, apschedule_id):
15 | """
16 |         Get the next scheduled run time for a job
17 | :param apschedule_id:
18 | :return:
19 | """
20 | schedule_obj = self.find(id=str(apschedule_id))
21 | if schedule_obj and schedule_obj.next_run_time:
22 | return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(schedule_obj.next_run_time))
23 | return
24 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Result
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ResultModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Result
11 |
12 | def save_model_by_grpc(self, **kwargs):
13 | """
14 |         Persist crawl data collected via rpc
15 | :param kwargs:
16 | :return:
17 | """
18 |
19 | obj = Result(
20 | subtask_id=kwargs['subtask_id'], url_id=kwargs['url_id'], url_address=kwargs['url_address'],
21 | http_code=kwargs.get('http_code'), title=kwargs.get('title'), content=kwargs.get('content'),
22 | current_url=kwargs.get('current_url'), redirect_chain=kwargs.get('redirect_chain', []),
23 | response_headers=kwargs.get('response_headers', {}), har_uuid=kwargs.get('har_uuid'),
24 | screenshot_id=kwargs.get('screenshot_id'), finished_at=kwargs['finished_at'],
25 | cookies=kwargs.get('cookies', []), wappalyzer_results=kwargs.get('wappalyzer_results', []),
26 | text=kwargs.get('text'), favicon_md5=kwargs.get('favicon_md5'), favicon_link=kwargs.get('favicon_link'),
27 | response_time=kwargs.get('response_time'), load_complete_time=kwargs.get('load_complete_time'),
28 | charset=kwargs.get('charset')
29 | )
30 | self.db_session.add(obj)
31 | self.db_session.flush()
32 | self.safe_commit()
33 | return obj
34 |
35 | def get_by_url(self, url, fields):
36 | """
37 |         Fetch the result by url
38 | :param url:
39 | :param fields:
40 | :return:
41 | """
42 |
43 | # obj = self.self_session.filter(
44 | # or_(self.model.url_address == url.rstrip('/'), self.model.url_address == url.rstrip('/') + '/')) \
45 | # .order_by(self.model.finished_at.desc()).first()
46 | from webs.api.models.db_proxy import url_model_proxy
47 | url_obj = url_model_proxy.find(address=url)
48 | if not url_obj:
49 | return {}
50 |
51 |         # Query all records for this url
52 | objs = self.self_session.filter(self.model.url_id == url_obj.id) \
53 | .order_by(self.model.id.desc()).all()
54 |
55 | latest_record = {}
56 | if objs:
57 | latest_record = objs[0].as_dict()
58 | latest_record['other_records'] = [{
59 | 'result_id': each.id,
60 | 'finished_at': each.finished_at.strftime("%Y-%m-%d %H:%M:%S")}
61 | for each in objs[1:]
62 | ]
63 |
64 | if fields: latest_record = {each: latest_record[each] for each in fields if each in latest_record}
65 |
66 | return latest_record
67 |
68 | def get_by_result_id(self, result_id):
69 | """
70 |         Fetch the result by id
71 | :param result_id:
72 | :return:
73 | """
74 |
75 | obj = self.self_session.filter(self.model.id == result_id).order_by(self.model.finished_at.desc()).first()
76 | return {} if not obj else obj.as_dict()
77 |
78 | def get_favicon_data_by_url(self, url):
79 | """
80 |         Get existing favicon info for a url
81 | :param url:
82 | :return:
83 | """
84 |
85 | obj = self.db_session.query(self.model.favicon_md5, self.model.favicon_link) \
86 | .filter(self.model.url_address == url).order_by(self.model.create_time.desc()).first()
87 | return (None, None) if not obj else (obj[0], obj[1])
88 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/schedule_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import ScheduleTaskRecord, SubTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ScheduleTaskProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = ScheduleTaskRecord
11 |
12 | def query_schedule_task_obj_by_subtask_id(self, subtask_id):
13 | """
14 |         Query the schedule task that a subtask belongs to
15 | :return:
16 | """
17 |
18 | return self.self_session \
19 | .join(SubTask, SubTask.schedule_task_id == self.model.id) \
20 | .filter(SubTask.id == subtask_id).first()
21 |
22 | def query_running_schedule_tasks(self, task_id):
23 | """
24 |         Query schedule tasks that are still running
25 | :param task_id:
26 | :return:
27 | """
28 | return self.self_session.filter(
29 | self.model.task_id == task_id,
30 | self.model.finished.is_(False)
31 | ).all()
32 |
33 | def query_running_task_and_task_id(self, schedule_task_id):
34 | """
35 |         Query the parent task id and its running schedule tasks
36 | :param schedule_task_id:
37 | :return:
38 | """
39 | schedule_task_obj = self.find(id=schedule_task_id)
40 | return schedule_task_obj.task_id, self.query_running_schedule_tasks(schedule_task_obj.task_id)
41 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import desc, asc
3 |
4 | from webs.api.models import Server
5 | from webs.api.models.db_proxy.base import BaseModelProxy
6 |
7 |
8 | class ServerModelProxy(BaseModelProxy):
9 | def __init__(self):
10 | super().__init__()
11 | self.model = Server
12 |
13 | def query_servers_by_score(self, sort='desc'):
14 | """
15 |         Score servers by weight and load
16 | :return:
17 | """
18 |
19 | query = self.self_session.filter(self.model.enabled.is_(True), self.model.status.is_(True)).all()
20 | results = [{
21 | 'server_id': each_obj.id,
22 | 'server_name': each_obj.server_name,
23 | 'server_address': each_obj.server_address,
24 | 'score': int((1 - float(each_obj.load)) * each_obj.weight * 10)
25 | } for each_obj in query]
26 | return sorted(results, key=lambda x: x['score'], reverse=True if sort == 'desc' else False)
27 |
28 | def add_server(self, address):
29 |         """Add a new spider server node"""
30 | obj = Server(server_name=address, server_address=address)
31 | self.db_session.add(obj)
32 | self.safe_commit()
33 | return
34 |
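A worked example of the scoring rule above, score = int((1 - load) * weight * 10), with made-up numbers:

servers = [
    {'server_name': 'node-a', 'load': '0.1', 'weight': 3},  # (1 - 0.1) * 3 * 10 = 27
    {'server_name': 'node-b', 'load': '0.5', 'weight': 5},  # (1 - 0.5) * 5 * 10 = 25
]
scored = sorted(
    ({**s, 'score': int((1 - float(s['load'])) * s['weight'] * 10)} for s in servers),
    key=lambda x: x['score'], reverse=True)
# node-a wins: a lightly loaded mid-weight node can outrank a busier high-weight one.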
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/subtask.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import SubTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class SubTaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = SubTask
11 |
12 | def create(self, schedule_task_id, server_id):
13 | """
14 |         Create a subtask
15 | :param schedule_task_id:
16 | :param server_id:
17 | :return:
18 | """
19 |
20 | obj = SubTask(schedule_task_id=schedule_task_id, server_id=server_id)
21 | self.db_session.add(obj)
22 | self.safe_commit()
23 | return obj
24 |
25 | def query_delivery_failure_count(self, schedule_task_id):
26 | """
27 |         Count subtasks whose delivery failed
28 | :return:
29 | """
30 | return self.self_session.filter(
31 | self.model.schedule_task_id == schedule_task_id,
32 | self.model.delivery_failure_msg.isnot(None)
33 | ).count()
34 |
35 | def query_unfinished_subtask_count(self, schedule_task_id):
36 | """
37 |         Count the unfinished subtasks of the given schedule task
38 | :param schedule_task_id:
39 | :return:
40 | """
41 | return self.self_session.filter(
42 | self.model.schedule_task_id == schedule_task_id, self.model.finished.is_(False)
43 | ).count()
44 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Task, SubTask, TaskUrl, Result, ScheduleTaskRecord
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class TaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Task
11 |
12 | def create(self,
13 | customer_id=None,
14 | task_name=None,
15 | extra_data=None,
16 | task_status='executing',
17 | crawl_options={},
18 | **kwargs):
19 | """
20 | :return:
21 | """
22 | obj = Task(
23 | customer_id=customer_id, task_name=task_name,
24 | task_status=task_status, crawl_options=crawl_options, extra_data=extra_data,
25 | schedule_options={'schedule_type': kwargs['schedule_type'], 'schedule_data': kwargs['schedule_data']})
26 | self.db_session.add(obj)
27 | self.db_session.flush()
28 | self.safe_commit()
29 |
30 | return obj
31 |
32 | def query_task_obj_by_subtask(self, subtask_id):
33 | """
34 |         Get the main task object via a subtask id
35 | :param subtask_id:
36 | :return:
37 | """
38 |
39 | task_obj = self.db_session.query(self.model).select_from(self.model) \
40 | .join(ScheduleTaskRecord, ScheduleTaskRecord.task_id == self.model.id) \
41 | .join(SubTask, SubTask.schedule_task_id == ScheduleTaskRecord.id) \
42 | .filter(SubTask.id == subtask_id) \
43 | .first()
44 |
45 | return task_obj
46 |
47 | def query_url_count(self, task_id):
48 | """
49 |         Get the total number of urls for a task
50 | :param task_id:
51 | :return:
52 | """
53 |
54 | return self.db_session.query(TaskUrl).filter(TaskUrl.task_id == task_id).count()
55 |
56 | def query_crawl_url_count(self, task_id):
57 | """
58 |         Get the number of urls already crawled for a task
59 | :param task_id:
60 | :return:
61 | """
62 |
63 | return self.db_session.query(Result) \
64 | .join(SubTask, Result.subtask_id == SubTask.id) \
65 | .join(ScheduleTaskRecord, ScheduleTaskRecord.id == SubTask.schedule_task_id) \
66 | .filter(ScheduleTaskRecord.task_id == task_id).count()
67 |
68 | def add_schedule_record(self, task_id, schedule_task_status, crawl_options):
69 | """
70 |         Add a schedule record
71 | :param task_id:
72 | :param schedule_task_status:
73 | :param crawl_options:
74 | :return:
75 | """
76 | obj = ScheduleTaskRecord(
77 | task_id=task_id,
78 | crawl_options=crawl_options,
79 | schedule_task_status=schedule_task_status
80 | )
81 | self.db_session.add(obj)
82 | self.safe_commit()
83 | return obj
84 |
85 | def query_task_loop_count(self, task_id):
86 | """
87 |         Get the number of rounds the task has already run
88 | :param task_id:
89 | :return:
90 | """
91 |
92 | return self.db_session.query(ScheduleTaskRecord).filter(ScheduleTaskRecord.task_id == task_id).count()
93 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/task_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import TaskUrl, Url
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class TaskUrlModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = TaskUrl
11 |
12 | def create(self, task_id, urls_id):
13 | """
14 | :return:
15 | """
16 | self.db_session.add_all(
17 | [TaskUrl(task_id=task_id, url_id=url_id) for url_id in urls_id])
18 | self.safe_commit()
19 |
20 | def create_subtask_url_mapping(self, chunk_url, subtask_id):
21 | """
22 |         Create the mapping between a subtask and its urls
23 | :param chunk_url:
24 | :param subtask_id:
25 | :return:
26 | """
27 | urls_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(chunk_url)).all()
28 | self.self_session.filter(self.model.url_id.in_([each[0] for each in urls_query])).update(
29 | {self.model.sub_task_id: subtask_id}, synchronize_session='fetch')
30 | self.safe_commit()
31 | return [{'url_id': each[0], 'url_address': each[1]} for each in urls_query]
32 |
33 | def query_urls_by_task_id(self, task_id):
34 | """
35 |         Query the urls associated with a task id
36 | :param task_id:
37 | :return:
38 | """
39 |
40 | query = self.db_session.query(self.model.url_id, Url.address) \
41 | .join(Url, Url.id == self.model.url_id) \
42 | .filter(self.model.task_id == task_id) \
43 | .all()
44 | return [{
45 | 'url_id': each_obj[0], 'url_address': each_obj[1]}
46 | for each_obj in query
47 | ]
48 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/db_proxy/url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Url
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class UrlModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Url
11 |
12 | def create(self, urls):
13 | """
14 | :return:
15 | """
16 |
17 |         # Find urls that already exist in the system
18 | exist_url_query = self.db_session.query(Url.id, Url.address).filter(Url.address.in_(urls)).all()
19 | exist_urls_id = [each[0] for each in exist_url_query]
20 |
21 |         # Create urls that do not yet exist in the system
22 | not_create_urls = set(urls).difference(set([each[1] for each in exist_url_query]))
23 | create_url_models = [Url(address=url) for url in not_create_urls]
24 | self.db_session.add_all(create_url_models)
25 | self.safe_commit()
26 |
27 | exist_urls_id.extend([each.id for each in create_url_models])
28 | return exist_urls_id
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Crawl result storage model
5 | """
6 |
7 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text
8 | from sqlalchemy.dialects.postgresql import JSONB
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Result(db.Model):
14 | __tablename__ = 'results'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 |     subtask_id = Column(Integer, nullable=False, index=True)  # id of the subtask this result belongs to
18 |     url_id = Column(Integer, nullable=False, index=True)  # url id
19 |     url_address = Column(String(1024), nullable=False)  # url address
20 |     http_code = Column(Integer)  # site HTTP status code
21 |     title = Column(Text)  # site title
22 |     content = Column(Text)  # site content
23 |     text = Column(Text)  # page body text
24 |     current_url = Column(String(1024))  # last url the site responded with
25 |     redirect_chain = Column(JSONB)  # redirect chain
26 |     response_headers = Column(JSONB)  # response headers
27 |     har_uuid = Column(String(128))  # site interaction trace (HAR)
28 |     screenshot_id = Column(String(128))  # screenshot id
29 |     cookies = Column(JSONB)  # cookies
30 |     finished_at = Column(TIMESTAMP)  # finish time
31 |     wappalyzer_results = Column(JSONB)  # site fingerprints
32 |     callback_failure_msg = Column(Text)  # callback error message
33 |     favicon_md5 = Column(String(50))  # favicon hash
34 |     favicon_link = Column(String(1024))  # favicon link
35 |     response_time = Column(Integer)  # site response time
36 |     load_complete_time = Column(Integer)  # page load-complete time
37 |     charset = Column(String(256))  # site charset
38 |
39 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
40 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
41 |
42 | def __repr__(self):
43 | return f''
44 |
45 | def as_dict(self):
46 | from webs.api.models.db_proxy import task_model_proxy
47 | task_obj = task_model_proxy.query_task_obj_by_subtask(self.subtask_id)
48 |
49 | return {
50 | 'result_id': self.id,
51 | 'subtask_id': self.subtask_id,
52 | 'task_id': task_obj.id if task_obj else None,
53 | 'customer_id': task_obj.customer_id if task_obj else None,
54 | 'url_id': self.url_id,
55 | 'url_address': self.url_address,
56 | 'http_code': self.http_code,
57 | 'title': self.title,
58 | 'content': self.content,
59 | 'text': self.text,
60 | 'current_url': self.current_url,
61 | 'redirect_chain': self.redirect_chain,
62 | 'response_headers': self.response_headers,
63 | 'har_uuid': self.har_uuid,
64 | 'screenshot_id': self.screenshot_id,
65 | 'cookies': self.cookies,
66 | 'favicon_md5': self.favicon_md5,
67 | 'favicon_link': self.favicon_link,
68 | 'wappalyzer_results': self.wappalyzer_results,
69 | 'response_time': self.response_time,
70 | 'load_complete_time': self.load_complete_time,
71 | 'charset': self.charset,
72 | 'finished_at': self.finished_at.strftime("%Y-%m-%d %H:%M:%S")
73 | }
74 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Spider node model
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Boolean, Integer, Float
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Server(db.Model):
14 | __tablename__ = 'servers'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 |     server_name = Column(String(128))  # spider node server name
18 |     server_address = Column(String(255), unique=True, nullable=True)  # server address, ip:port
19 |     enabled = Column(Boolean, server_default='t')  # whether enabled, defaults to true
20 |     status = Column(Boolean, server_default='t')  # whether the server is healthy, defaults to true
21 |     weight = Column(Integer, server_default='3')  # server weight 1-5, defaults to 3
22 |     # load = Column(Integer, server_default='0')  # server load, reported periodically by child nodes
23 |     load = Column(String(20), server_default='0.1')  # server load, reported periodically to the master node
24 |     spider_type = Column(String(20), server_default='splash')  # spider node type: splash/pyppeteer
25 |
26 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
27 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
28 |
29 | def __repr__(self):
30 | return f''
31 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Dynamic crawler scan task models
6 | """
7 | from datetime import datetime
8 |
9 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY, Boolean, Text
10 | from sqlalchemy.dialects.postgresql import JSONB
11 |
12 | from webs.api.models import db
13 |
14 |
15 | class Task(db.Model):
16 | __tablename__ = 'tasks'
17 |
18 | id = Column(BigInteger, primary_key=True, autoincrement=True)
19 |     customer_id = Column(String(255), index=True)  # opaque caller identifier; echoed back unchanged in API responses so callers can match requests to results
20 |     task_name = Column(String(255))  # task name
21 |     task_status = Column(String(255))  # task status
22 |     finished = Column(Boolean, server_default='f')  # whether the task has finished
23 |     schedule_options = Column(JSONB)  # periodic scheduling parameters
24 |     crawl_options = Column(JSONB)  # crawl parameters
25 |     extra_data = Column(Text)  # extra client-provided data
26 |
27 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
28 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
29 |
30 | def __repr__(self):
31 | return f''
32 |
33 | def as_dict(self, **kwargs):
34 | return {
35 | 'task_id': self.id,
36 | 'customer_id': self.customer_id,
37 | 'task_name': self.task_name,
38 | 'task_status': self.task_status,
39 | 'finished': self.finished,
40 | 'crawl_options': self.crawl_options,
41 | 'schedule_options': self.schedule_options,
42 | 'extra_data': self.extra_data,
43 | 'create_time': self.create_time.strftime("%Y-%m-%d %H:%M:%S"),
44 | 'update_time': self.update_time.strftime("%Y-%m-%d %H:%M:%S")
45 | } if not kwargs.get('fields') else {
46 | f: getattr(self, f, None) if not isinstance(getattr(self, f), datetime)
47 | else getattr(self, f).strftime("%Y-%m-%d %H:%M:%S")
48 | for f in kwargs['fields'] if f in self.__table__.columns
49 | }
50 |
51 |
52 | class SubTask(db.Model):
53 | __tablename__ = 'sub_tasks'
54 |
55 | id = Column(BigInteger, primary_key=True, autoincrement=True)
56 |     schedule_task_id = Column(Integer, nullable=False, index=True)  # id of the schedule-task run this subtask belongs to
57 |     server_id = Column(Integer, nullable=False)  # server node assigned to this subtask
58 |     assigned_urls = Column(ARRAY(String))  # urls assigned to this subtask
59 |     delivery_failure_msg = Column(Text)  # delivery failure reason
60 |     finished = Column(Boolean, server_default='f')  # whether finished
61 |     finished_at = Column(TIMESTAMP)  # finish time
62 |
63 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
64 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
65 |
66 | def __repr__(self):
67 | return f''
68 |
69 |
70 | class ScheduleTaskRecord(db.Model):
71 | __tablename__ = 'schedule_task_records'
72 |
73 | id = Column(BigInteger, primary_key=True, autoincrement=True)
74 |     task_id = Column(Integer, nullable=False, index=True)  # parent task id
75 |     schedule_task_status = Column(String(255))  # schedule-task status
76 |     finished = Column(Boolean, server_default='f')  # whether this run has finished
77 |     crawl_options = Column(JSONB)  # crawl parameters used for this run
78 |
79 |     create_time = Column(TIMESTAMP, server_default=func.now(), index=True)  # schedule-task creation time
80 |     start_time = Column(TIMESTAMP)  # when this run actually started
81 |     finished_time = Column(TIMESTAMP)  # when this run finished
82 |
83 | def __repr__(self):
84 | return f''
85 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/task_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Task-to-url mapping
5 | """
6 |
7 | from sqlalchemy import Column, BigInteger, TIMESTAMP, func
8 |
9 | from webs.api.models import db
10 |
11 |
12 | class TaskUrl(db.Model):
13 | __tablename__ = 'task_url'
14 |
15 | id = Column(BigInteger, primary_key=True, autoincrement=True)
16 | task_id = Column(BigInteger, nullable=True, index=True)
17 |     # sub_task_id = Column(BigInteger, index=True)  # subtask id
18 | url_id = Column(BigInteger, nullable=True, index=True)
19 |
20 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
21 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
22 |
23 | def __repr__(self):
24 | return f''
25 |
--------------------------------------------------------------------------------
/services/engine/webs/api/models/url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Url model
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func
9 |
10 | from webs.api.models import db
11 |
12 |
13 | class Url(db.Model):
14 | __tablename__ = 'urls'
15 |
16 | id = Column(BigInteger, primary_key=True, autoincrement=True)
17 | address = Column(String(1024), unique=True, index=True)
18 |
19 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
20 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
21 |
22 | def __repr__(self):
23 | return f''
24 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | from datetime import datetime
4 |
5 | from webs.api.models.db_proxy import task_model_proxy
6 | from webs.api.exceptions.customs import InvalidAPIRequest, RecordNotFound
7 | from webs.api.utils.helper import nowstr, today
8 |
9 |
10 | class LengthChecker(object):
11 |     """Field length validator"""
12 |
13 | def __init__(self, sign, length):
14 | self.sign = sign
15 | self.length = length
16 |
17 | def __call__(self, verified):
18 | if verified is not None and len(verified) > self.length:
19 | raise InvalidAPIRequest(f'{self.sign}长度过长!')
20 |
21 |
22 | class OneOf(object):
23 | """Validator which succeeds if ``value`` is a member of ``choices``"""
24 |
25 | def __init__(self, choices):
26 | self.choices = choices
27 |
28 | def __call__(self, verified):
29 | if verified not in self.choices:
30 | raise InvalidAPIRequest(f'请选择{self.choices}其中之一!')
31 |
32 |
33 | class TaskValidator(object):
34 |     """Main task validator"""
35 |
36 | def __init__(self):
37 | self.url_pattern = r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
38 |
39 | def url_valid(self, url):
40 | if not re.match(self.url_pattern, url):
41 | raise InvalidAPIRequest(f'{url}不是一个合法的Url!')
42 |
43 | def task_id_exist_valid(self, task_id):
44 | if not task_model_proxy.find(id=task_id):
45 | raise RecordNotFound('该任务不存在!')
46 |
47 | @staticmethod
48 | def schedule_valid(kwargs):
49 |         """Validate periodic scheduling parameters"""
50 | schedule_type, schedule_data = kwargs.get('schedule_type'), kwargs.get('schedule_data')
51 |
52 |         # Validate one-off (datetime) scheduling
53 | if schedule_type == 'datetime':
54 | if len(schedule_data) > 1:
55 | raise InvalidAPIRequest('无效的执行时间!')
56 | run_date = schedule_data.get('run_date')
57 | if not run_date:
58 | raise InvalidAPIRequest('无效的执行时间!')
59 |             # Compare with the current time
60 | if run_date <= nowstr():
61 | raise InvalidAPIRequest('执行时间不能小于当前时间!')
62 |
63 |         # Validate interval and cron scheduling
64 | elif schedule_type in ('interval', 'cron'):
65 | if not schedule_data.get('start_date') or not schedule_data.get('end_date'):
66 | raise InvalidAPIRequest('请输入开始时间和结束时间!')
67 |
68 | interval_effective_params = {
69 | 'weeks', 'days', 'hours', 'minutes', 'seconds',
70 | 'start_date', 'end_date', 'max_instances'
71 | }
72 | cron_effective_params = {
73 | 'week', 'day', 'hour', 'minute',
74 | 'second', 'year', 'month',
75 | 'day_of_week', 'max_instances',
76 | 'start_date', 'end_date'
77 | }
78 |
79 | if (schedule_type == 'cron' and set(schedule_data.keys()).difference(cron_effective_params)) or (
80 | schedule_type == 'interval' and set(schedule_data.keys()).difference(interval_effective_params)):
81 | raise InvalidAPIRequest('无效的调度参数!')
82 |
83 | if not set(schedule_data.keys()).difference({'start_date', 'end_date'}):
84 | raise InvalidAPIRequest('请输入正确的调度参数!')
85 |
86 | if schedule_data.get('start_date') >= schedule_data.get('end_date'):
87 | raise InvalidAPIRequest('开始时间不能大于结束时间!')
88 |
89 | if schedule_data.get('end_date') < today():
90 | raise InvalidAPIRequest('结束时间不能小于当前时间!')
91 |
92 |
93 | class TimeValidator(object):
94 |
95 | def __init__(self, s=None, e=None):
96 | self.s = s
97 | self.e = e
98 |
99 | @staticmethod
100 | def date_or_datetime_valid(_time):
101 | try:
102 | datetime.strptime(_time, "%Y-%m-%d")
103 | return
104 | except (ValueError, AttributeError) as e:
105 | pass
106 | try:
107 | datetime.strptime(_time, "%Y-%m-%d %H:%M:%S")
108 | return
109 | except (ValueError, AttributeError) as e:
110 | pass
111 | raise InvalidAPIRequest('请输入正确的日期时间!')
112 |
113 | def __call__(self, time_field):
114 | if not self.s <= time_field <= self.e:
115 | raise InvalidAPIRequest('请输入正确的时间范围!')
116 |
117 |
118 | task_validator = TaskValidator()
119 | time_validator = TimeValidator()
120 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from webargs import fields
3 |
4 | result_by_url_schema = {
5 | "url": fields.Url(required=True),
6 | "fields": fields.DelimitedList(fields.Str(), missing=[])
7 | }
8 |
9 | result_by_id_schema = {
10 | "result_id": fields.Int(required=True)
11 | }
12 |
13 | get_screenshot_schema = {
14 | 'screenshot_id': fields.Str(required=True)
15 | }
16 |
17 | download_har_file_schema = {
18 | 'har_uuid': fields.Str(required=True)
19 | }
20 |
21 | get_favicon_schema = {
22 | 'favicon_md5': fields.Str(required=True)
23 | }
24 |
25 | get_small_schema = {
26 | **get_screenshot_schema,
27 | 'wide': fields.Int(missing=272),
28 | 'high': fields.Int(missing=165)
29 | }
30 |
--------------------------------------------------------------------------------
/services/engine/webs/api/schemas/tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 |
4 | from webargs import fields
5 |
6 | from webs.api.schemas import LengthChecker, OneOf, task_validator, TimeValidator as TimeRangeValid, time_validator
7 |
8 | schedule_data = {
9 |     'run_date': fields.Str(validate=time_validator.date_or_datetime_valid),  # one-off execution time
10 | 'year': fields.Int(validate=TimeRangeValid(2021, 2999)),
11 | 'month': fields.Int(validate=TimeRangeValid(1, 12)),
12 | 'day': fields.Int(validate=TimeRangeValid(1, 31)),
13 | 'week': fields.Int(validate=TimeRangeValid(1, 53)),
14 | 'day_of_week': fields.Int(validate=TimeRangeValid(0, 6)),
15 | 'hour': fields.Int(validate=TimeRangeValid(0, 23)),
16 | 'minute': fields.Int(validate=TimeRangeValid(0, 59)),
17 | 'second': fields.Int(validate=TimeRangeValid(0, 59)),
18 | 'weeks': fields.Int(),
19 | 'days': fields.Int(),
20 | 'hours': fields.Int(),
21 | 'minutes': fields.Int(),
22 | 'seconds': fields.Int(),
23 | 'start_date': fields.Str(validate=time_validator.date_or_datetime_valid),
24 | 'end_date': fields.Str(validate=time_validator.date_or_datetime_valid),
25 | 'max_instances': fields.Int(missing=1)
26 | }
27 |
28 | crawl_options = {
29 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])),
30 |     'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])),  # task priority
31 |     'headless': fields.Bool(missing=False),  # headless/headed mode, defaults to headed
32 |     'debug': fields.Bool(missing=False),  # enable debug mode
33 |     'referer': fields.Str(),  # referer address
34 |     'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])),  # concurrency
35 |     'url_timeout': fields.Int(missing=30),  # per-url timeout
36 |     'enabled_render_js': fields.Bool(missing=True),
37 |     'page_wait_time': fields.Int(missing=3),  # time to wait for page js rendering
38 |     'ignore_ssl': fields.Bool(missing=True),  # ignore certificate errors
39 |     'screenshot': fields.Bool(missing=False),  # take a screenshot
40 |     'proxy_url': fields.Str(),  # proxy
41 |     'user_agent': fields.Str(),  # user agent
42 |     'record_har': fields.Bool(missing=False),  # record network requests (HAR)
43 |     'record_redirect': fields.Bool(missing=False),  # record the redirect chain
44 |     'use_browser_cache': fields.Bool(missing=True),  # use the browser cache
45 |     'use_result_cache': fields.Bool(missing=True),  # use the result cache
46 |     'wappalyzer': fields.Bool(missing=False),  # run fingerprint detection
47 |     'extract_text': fields.Bool(missing=True),  # extract page body text
48 |     'extract_favicon': fields.Bool(missing=False),  # download the site favicon
49 |     'callback_type': fields.Str(validate=OneOf(choices=['http', 'rpc'])),
50 |     'callback_address': fields.Str(),
51 |     'wait_until': fields.Str(
52 |         missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])),  # controls when the page is considered loaded
53 | 'rpc_server': fields.Str(missing=os.getenv('LOCAL_RPC_SERVER_ADDRESS'))
54 | }
55 |
56 | create_task_schema = {
57 | 'customer_id': fields.Str(validate=LengthChecker(sign='自定义id', length=255)),
58 | 'task_name': fields.Str(validate=LengthChecker(sign='任务名称', length=255)),
59 | 'urls': fields.DelimitedList(fields.Str(validate=task_validator.url_valid), required=True),
60 | 'schedule_type': fields.Str(missing='instantly', validate=OneOf(['instantly', 'datetime', 'interval', 'cron'])),
61 | 'schedule_data': fields.Nested(schedule_data, missing={}),
62 | 'crawl_options': fields.Nested(crawl_options, missing={}),
63 | 'extra_data': fields.Str(),
64 | }
65 |
66 | task_id_schema = {
67 | 'task_id': fields.Int(required=True, validate=task_validator.task_id_exist_valid)
68 | }
69 |
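These schemas are consumed by the views in webs/api/views/tasks.py (not shown in this section); with webargs the binding could look roughly like the sketch below. The decorator usage is an assumption about how the views are written, not a copy of them.

from webargs.flaskparser import use_args

from webs.api.schemas.tasks import create_task_schema


@use_args(create_task_schema)
def create_task(args):
    # Defaults from crawl_options are filled in automatically, e.g.
    # args['crawl_options']['browser_type'] == 'firefox' and concurrency == 5.
    ...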
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 |
5 |
6 | def now():
7 | return datetime.now()
8 |
9 |
10 | def nowstr():
11 | return now().strftime('%Y-%m-%d %H:%M:%S')
12 |
13 |
14 | def today():
15 | return now().strftime('%Y-%m-%d')
16 |
17 |
18 | def add_spider_server(address):
19 |     """Add a spider service address"""
20 | from webs.api.models.db_proxy import server_model_proxy
21 | server_model_proxy.add_server(address)
22 |
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/loggers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import socket
6 | import sys
7 | import traceback
8 | from datetime import datetime
9 |
10 | try:
11 | import simplejson as json
12 | except ImportError:
13 | import json
14 |
15 |
16 | class JSONFormatter(logging.Formatter):
17 | """
18 | JSON formatter for python logging
19 |
20 | You can pass additional tags on a per message basis using the
21 | key "tags" in the extra parameter.
22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]})
23 | """
24 |
25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON',
26 | indent=None):
27 | """
28 | :param tags: a list of tags to add to every messages
29 | :hostname: force a specific hostname
30 | :fqdn: a boolean to use the FQDN instead of the machine's hostname
31 | :message_type: the message type for Logstash formatters
32 | :indent: indent level of the JSON output
33 | """
34 | self.message_type = message_type
35 | self.tags = tags if tags is not None else []
36 | self.extra_tags = []
37 | self.indent = indent
38 |
39 | if hostname:
40 | self.host = hostname
41 | elif fqdn:
42 | self.host = socket.getfqdn()
43 | else:
44 | self.host = socket.gethostname()
45 |
46 | def get_extra_fields(self, record):
47 | # The list contains all the attributes listed in
48 | # http://docs.python.org/library/logging.html#logrecord-attributes
49 | skip_list = [
50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args',
51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg',
52 | 'msecs', 'msecs', 'message', 'name', 'pathname', 'process',
53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra']
54 |
55 | if sys.version_info < (3, 0):
56 | easy_types = (str, bool, dict, float, int, list, type(None))
57 | else:
58 | easy_types = (str, bool, dict, float, int, list, type(None))
59 |
60 | fields = {}
61 |
62 | self.extra_tags = []
63 | for key, value in record.__dict__.items():
64 | if key not in skip_list:
65 | if key == 'tags' and isinstance(value, list):
66 | self.extra_tags = value
67 | elif isinstance(value, easy_types):
68 | fields[key] = value if value else "null"
69 | else:
70 | fields[key] = repr(value)
71 |
72 | return fields
73 |
74 | def get_debug_fields(self, record):
75 | if record.exc_info:
76 | exc_info = self.format_exception(record.exc_info)
77 | else:
78 | exc_info = record.exc_text
79 | return {
80 | 'exc_info': exc_info,
81 | 'filename': record.filename,
82 | 'lineno': record.lineno,
83 | }
84 |
85 | @classmethod
86 | def format_source(cls, message_type, host, path):
87 | return "%s://%s/%s" % (message_type, host, path)
88 |
89 | @classmethod
90 | def format_timestamp(cls, time):
91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X"))
92 |
93 | @classmethod
94 | def format_exception(cls, exc_info):
95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else ''
96 |
97 | @classmethod
98 | def serialize(cls, message, indent=None):
99 | return json.dumps(message, ensure_ascii=False, indent=indent)
100 |
101 | def format(self, record, serialize=True):
102 | old_message = record.getMessage()
103 | try:
104 | new_message = json.loads(old_message)
105 | except json.decoder.JSONDecodeError as e:
106 | message = old_message.replace("'", '"')
107 | new_message = json.loads(message)
108 | except Exception:
109 | new_message = record.getMessage()
110 | # Create message dict
111 | message = {
112 | 'timestamp': self.format_timestamp(record.created),
113 | 'app': os.environ.get('APP_NAME'),
114 | 'host': self.host,
115 | 'environment': os.environ.get('FLASK_ENV'),
116 | 'logger': record.name,
117 | 'level': record.levelname,
118 | 'messages': new_message,
119 | 'path': record.pathname,
120 | 'tags': self.tags[:]
121 | }
122 |
123 | # Add extra fields
124 | message.update(self.get_extra_fields(record))
125 |
126 | # Add extra tags
127 | if self.extra_tags:
128 | message['tags'].extend(self.extra_tags)
129 |
130 | # If exception, add debug info
131 | if record.exc_info or record.exc_text:
132 | message.update(self.get_debug_fields(record))
133 |
134 | if serialize:
135 | return self.serialize(message, indent=self.indent)
136 | return message
137 |
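For reference, a minimal sketch of attaching this formatter to a plain Python logger (the spider's gunicorn_logging.ini later in this listing wires the same class via `class = webs.api.utils.loggers.JSONFormatter`); the logger name and tags below are illustrative only:

# A minimal sketch, assuming APP_NAME/FLASK_ENV are set in the environment.
import json
import logging

from webs.api.utils.loggers import JSONFormatter

handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter(tags=['crawloop']))

logger = logging.getLogger('engine')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# The formatter expects JSON-ish messages; per-message tags go through `extra`.
logger.info(json.dumps({"event": "task_created", "task_id": 1}),
            extra={"tags": ["component=engine"]})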
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/requests.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 | from flask import current_app, request
5 | from sqlalchemy.exc import DatabaseError
6 | from webs.api.exceptions.customs import InvalidContentType
7 | from webs.api.models import db
8 |
9 | ACL_ORIGIN = 'Access-Control-Allow-Origin'
10 | ACL_METHODS = 'Access-Control-Allow-Methods'
11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers'
12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials'
13 | ACL_CACHE_CONTROL = 'Cache-Control'
14 |
15 | GET_METHOD = 'GET'
16 | OPTIONS_METHOD = 'OPTIONS'
17 | ALLOWED_ORIGINS = '*'
18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \
20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type'
21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie
22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate'
23 |
24 |
25 | def before_request_middleware(app):
26 | app.before_request_funcs.setdefault(None, [
27 | ensure_request_log,
28 | ensure_content_type,
29 | ])
30 |
31 |
32 | def after_request_middleware(app):
33 | app.after_request_funcs.setdefault(None, [
34 | enable_cors,
35 | commit_session,
36 | ])
37 |
38 |
39 | def teardown_appcontext_middleware(app):
40 | app.teardown_appcontext_funcs = [
41 | shutdown_session,
42 | ]
43 |
44 |
45 | def ensure_request_log():
46 | """当为生产环境时,屏蔽中间件日志记录器"""
47 | if current_app.debug:
48 | current_app.logger.info(
49 | "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || "
50 | "Parameters: {param}".format(
51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr),
53 | path=request.full_path,
54 | param=request.data.decode('utf-8')))
55 |
56 |
57 | def ensure_content_type():
58 | """
59 |     Ensures that the Content-Type for all non-GET/OPTIONS requests
60 |     is `application/json` or `multipart/form-data`; otherwise an appropriate error
61 |     is raised.
62 |     :raises: InvalidContentType if Content-Type is neither `application/json`
63 |     nor `multipart/form-data`
64 | """
65 |
66 | content_type = request.headers.get('Content-Type')
67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \
68 | (not content_type or not ('application/json' in content_type or
69 | 'multipart/form-data' in content_type)):
70 | raise InvalidContentType(
71 | message='Invalid Content-Type. '
72 | 'Only `application/json` or `multipart/form-data` is allowed')
73 |
74 |
75 | def enable_cors(response):
76 | """
77 | Enable Cross-origin resource sharing.
78 | These headers are needed for the clients that
79 | will consume the API via AJAX requests.
80 | """
81 | if request.method == OPTIONS_METHOD:
82 | response = current_app.make_default_options_response()
83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS
84 | response.headers[ACL_METHODS] = ALLOWED_METHODS
85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS
86 |         response.headers[ACL_CACHE_CONTROL] = ALLOWED_CACHE_CONTROL  # set the cache policy value, not the header name
87 |
88 | return response
89 |
90 |
91 | def commit_session(response):
92 | """
93 | Try to commit the db session in the case
94 | of a successful request with status_code
95 | under 400.
96 | """
97 | if response.status_code >= 400:
98 | return response
99 | try:
100 | db.session.commit()
101 | except DatabaseError:
102 | db.session.rollback()
103 | return response
104 |
105 |
106 | def shutdown_session(exception=None):
107 | """
108 | Remove the db session and detach from the
109 | database driver after application shutdown.
110 | """
111 | db.session.remove()
112 |
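These three registration helpers are meant to be called from the Flask application factory. The real create_app lives in webs/__init__.py and is not reproduced here; the sketch below only shows where the hooks plug in, under that assumption:

# Illustrative sketch only: how the request/response hooks are typically registered.
from flask import Flask

from webs.api.utils.requests import (after_request_middleware,
                                     before_request_middleware,
                                     teardown_appcontext_middleware)


def create_app_sketch():
    app = Flask(__name__)
    before_request_middleware(app)        # request logging + Content-Type check
    after_request_middleware(app)         # CORS headers + session commit
    teardown_appcontext_middleware(app)   # remove the db session on teardown
    return app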
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/routers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pkgutil
4 |
5 |
6 | def register_routes(app):
7 | """Register routes."""
8 | from .. import views
9 | from flask.blueprints import Blueprint
10 |
11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."):
12 | blueprint_name = name.split('.')[-1]
13 | modules = __import__(name, fromlist="dummy")
14 | blueprint = getattr(modules, blueprint_name)
15 | if isinstance(blueprint, Blueprint):
16 | app.register_blueprint(blueprint)
17 |
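register_routes only picks up a view module if it exposes a Blueprint bound to a variable named after the module itself (it does `getattr(module, module_name)`). A minimal conforming view module, with a hypothetical module name and route:

# Hypothetical webs/api/views/health.py -- the Blueprint variable must share the module name ("health").
from flask import Blueprint, jsonify

health = Blueprint('health', __name__, url_prefix='/health')


@health.route('')
def health_check():
    return jsonify({'status': True}), 200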
--------------------------------------------------------------------------------
/services/engine/webs/api/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_migrate import Migrate
4 |
5 | from webs.api.models import db, redis_store
6 |
7 |
8 | def init_db(app):
9 | """
10 | Create database if doesn't exist and
11 | create all tables.
12 | """
13 |
14 |     # Initialize PostgreSQL
15 |     db.init_app(app)
16 |     migrate = Migrate(compare_type=True, compare_server_default=True)
17 |     migrate.init_app(app, db)
18 |
19 |     # Initialize Redis
20 | redis_store.init_app(app)
21 |
22 | return db
23 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/ping.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 |
5 | ping = Blueprint('ping', __name__)
6 |
7 |
8 | @ping.route('/ping')
9 | def ping_pong():
10 | """
11 |     Health check: verify the service is available.
12 | """
13 | return jsonify({
14 | "data": "pong",
15 | "status": True
16 | })
17 |
--------------------------------------------------------------------------------
/services/engine/webs/api/views/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.result import ResultBiz
7 | from webs.api.schemas.results import result_by_url_schema, result_by_id_schema, get_screenshot_schema, \
8 | download_har_file_schema, get_favicon_schema, get_small_schema
9 |
10 | results = Blueprint('results', __name__, url_prefix='/results')
11 |
12 |
13 | @results.route('/get-by-url')
14 | @use_args(result_by_url_schema, locations=('query',))
15 | def result_by_url(args):
16 | """
17 |     Query a result by URL.
18 | :param args:
19 | :return:
20 | """
21 |
22 | result_biz = ResultBiz()
23 | return jsonify({
24 | 'status': True,
25 | 'data': result_biz.result_by_url(args['url'], args['fields'])
26 | }), 200
27 |
28 |
29 | @results.route('/get-by-id')
30 | @use_args(result_by_id_schema, locations=('query',))
31 | def result_by_id(args):
32 | """
33 |     Query a result by ID.
34 | :param args:
35 | :return:
36 | """
37 |
38 | result_biz = ResultBiz()
39 | return jsonify({
40 | 'status': True,
41 | 'data': result_biz.result_by_id(args['result_id'])
42 | }), 200
43 |
44 |
45 | @results.route('/screenshot')
46 | @use_args(get_screenshot_schema, locations=('query',))
47 | def get_screenshot(args):
48 | """
49 |     Fetch a screenshot image.
50 | :param args:
51 | :return:
52 | """
53 |
54 | result_biz = ResultBiz()
55 | return result_biz.get_screenshot(args['screenshot_id'])
56 |
57 |
58 | @results.route('/screenshot/encode')
59 | @use_args(get_screenshot_schema, locations=('query',))
60 | def get_screenshot_base64(args):
61 | """
62 |     Fetch a screenshot as a base64-encoded string.
63 | :param args:
64 | :return:
65 | """
66 |
67 | result_biz = ResultBiz()
68 | return jsonify({
69 | 'status': True,
70 | 'data': result_biz.get_screenshot_base64_encode(args['screenshot_id'])
71 | }), 200
72 |
73 |
74 | @results.route('/screenshot/download')
75 | @use_args(get_screenshot_schema, locations=('query',))
76 | def download_screenshot(args):
77 | """
78 |     Download a screenshot image.
79 | :param args:
80 | :return:
81 | """
82 | result_biz = ResultBiz()
83 | return result_biz.get_screenshot(args['screenshot_id'], download=True)
84 |
85 |
86 | @results.route('/screenshot/small')
87 | @use_args(get_small_schema, locations=('query',))
88 | def small_screenshot(args):
89 | """
90 |     View a screenshot thumbnail.
91 | :param args:
92 | :return:
93 | """
94 | result_biz = ResultBiz()
95 | return result_biz.get_small_screenshot(**args)
96 |
97 |
98 | @results.route('/har/download')
99 | @use_args(download_har_file_schema, locations=('query',))
100 | def download_har_file(args):
101 | """
102 |     Download a HAR file.
103 | :param args:
104 | :return:
105 | """
106 |
107 | result_biz = ResultBiz()
108 | return result_biz.download_har(args['har_uuid'])
109 |
110 |
111 | @results.route('/favicon')
112 | @use_args(get_favicon_schema, locations=('query',))
113 | def get_favicon(args):
114 | """
115 |     View a website favicon.
116 | :param args:
117 | :return:
118 | """
119 | result_biz = ResultBiz()
120 | return result_biz.get_favicon(args['favicon_md5'])
121 |
122 |
123 | @results.route('/favicon/download')
124 | @use_args(get_favicon_schema, locations=('query',))
125 | def download_favicon(args):
126 | """
127 |     Download a website favicon.
128 | :param args:
129 | :return:
130 | """
131 | result_biz = ResultBiz()
132 | return result_biz.get_favicon(args['favicon_md5'], download=True)
133 |
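A hypothetical client call against these endpoints; the host/port and the comma-delimited form of `fields` are assumptions (the results schema itself is defined elsewhere), not guarantees from the code above:

# Illustrative query of /results/get-by-url; values are placeholders.
import requests

resp = requests.get(
    'http://localhost:5000/results/get-by-url',
    params={'url': 'https://example.com', 'fields': 'title,http_code'},
)
print(resp.json())   # expected shape: {"status": true, "data": ...}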
--------------------------------------------------------------------------------
/services/engine/webs/api/views/tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.task import TaskBiz
7 | from webs.api.schemas import task_validator
8 | from webs.api.schemas.tasks import create_task_schema, task_id_schema
9 |
10 | tasks = Blueprint('tasks', __name__, url_prefix='/tasks')
11 |
12 |
13 | @tasks.route('', methods=['POST'])
14 | @use_args(create_task_schema, locations=('json',), validate=task_validator.schedule_valid)
15 | def create_task(args):
16 | """
17 |     Create a crawl task.
18 | :param args:
19 | :return:
20 | """
21 |
22 | task_biz = TaskBiz()
23 | data = task_biz.create_task(**args)
24 |
25 | return jsonify({
26 | 'status': True,
27 | 'data': data
28 | }), 201
29 |
30 |
31 | @tasks.route('', methods=['DELETE'])
32 | @use_args(task_id_schema, locations=('json',))
33 | def delete_task(args):
34 | """
35 |     Delete a crawl task.
36 | :param args:
37 | :return:
38 | """
39 |
40 | task_biz = TaskBiz()
41 | task_biz.delete_task(args['task_id'])
42 |
43 | return jsonify({
44 | 'status': True
45 | }), 204
46 |
47 |
48 | @tasks.route('/status')
49 | @use_args(task_id_schema, locations=('query',))
50 | def get_task_status(args):
51 | """
52 |     Query task progress.
53 | :param args:
54 | :return:
55 | """
56 |
57 | task_biz = TaskBiz()
58 | return jsonify({
59 | 'status': True,
60 | 'data': task_biz.get_task_status(**args)
61 | }), 200
62 |
63 |
64 | @tasks.route('/pause', methods=['PATCH'])
65 | @use_args(task_id_schema, locations=('json',))
66 | def pause_task(args):
67 | """
68 |     Pause a scheduled task.
69 | :param args:
70 | :return:
71 | """
72 |
73 | task_biz = TaskBiz()
74 | task_biz.pause_task(args['task_id'])
75 |
76 | return jsonify({
77 | 'status': True,
78 | }), 200
79 |
80 |
81 | @tasks.route('/resume', methods=['PATCH'])
82 | @use_args(task_id_schema, locations=('json',))
83 | def resume_task(args):
84 | """
85 |     Resume a scheduled task.
86 | :param args:
87 | :return:
88 | """
89 |
90 | task_biz = TaskBiz()
91 | task_biz.resume_task(args['task_id'])
92 |
93 | return jsonify({
94 | 'status': True,
95 | }), 200
96 |
97 |
98 | @tasks.route('/redelivery', methods=['POST'])
99 | @use_args(task_id_schema, locations=('json',))
100 | def redelivery(args):
101 | """
102 |     Re-deliver the task to the spider nodes.
103 | """
104 |
105 | task_biz = TaskBiz()
106 | task_biz.redelivery(args['task_id'])
107 | return jsonify({'status': True}), 200
108 |
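A hypothetical payload for POST /tasks, matching the tail of create_task_schema shown at the top of this listing (urls, schedule_type, schedule_data, crawl_options, extra_data); the host/port and the concrete values are placeholders:

# Illustrative task creation request; crawl_options is omitted here to avoid guessing its keys.
import requests

payload = {
    "urls": ["https://example.com", "https://example.org"],
    "schedule_type": "instantly",   # one of: instantly, datetime, interval, cron
    "schedule_data": {},
    "extra_data": "client-side-correlation-id",
}
resp = requests.post('http://localhost:5000/tasks', json=payload)
print(resp.status_code, resp.json())   # 201 with {"status": true, "data": ...} on success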
--------------------------------------------------------------------------------
/services/engine/webs/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import os
5 |
6 |
7 | class BaseConfig:
8 | """Base configuration"""
9 |
10 | # Root path of project
11 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12 |
13 | DEBUG = True
14 | TESTING = False
15 | SQLALCHEMY_TRACK_MODIFICATIONS = False
16 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True}
17 | SECRET_KEY = os.environ.get('SECRET_KEY')
18 |
19 | # Redis configuration
20 | REDIS_URL = os.environ.get('REDIS_URL')
21 |
22 |
23 | class DevelopmentConfig(BaseConfig):
24 | """Development configuration"""
25 |
26 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
27 |
28 |
29 | class ProductionConfig(BaseConfig):
30 | """Production configuration"""
31 |
32 | DEBUG = False
33 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
34 |
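All configuration values are read from environment variables at import time, so they must be set before webs.config is imported. A small sketch with placeholder values (the real deployment provides these through the container environment):

# Placeholder values for the variables this config reads; illustrative only.
import os

os.environ.setdefault('SECRET_KEY', 'change-me')
os.environ.setdefault('DATABASE_URL', 'postgresql://user:password@postgres:5432/crawloop')
os.environ.setdefault('REDIS_URL', 'redis://redis:6379/0')

from webs.config import DevelopmentConfig

print(DevelopmentConfig.SQLALCHEMY_DATABASE_URI)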
--------------------------------------------------------------------------------
/services/engine/webs/core/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/webs/core/requests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .request import web_client
4 |
--------------------------------------------------------------------------------
/services/engine/webs/core/requests/request.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import requests
5 |
6 | from webs.api.exceptions.customs import ServerError, InvalidAPIRequest, RecordNotFound, RecordAlreadyExists
7 |
8 |
9 | class RequestMixin(object):
10 | CODE_EXCEPTION_MSG = {
11 | 400: InvalidAPIRequest,
12 | 404: RecordNotFound,
13 | 409: RecordAlreadyExists,
14 | 422: InvalidAPIRequest,
15 | 500: ServerError,
16 | }
17 |
18 | def __init__(self):
19 | self.session = requests.Session()
20 |
21 | @property
22 | def _headers(self):
23 | return {
24 | "Content-Type": "application/json",
25 | }
26 |
27 | def request(self, server, method, url, json=None, params=None, timeout=60):
28 | try:
29 | response = self.session.request(
30 | method, url, json=json, params=params,
31 | timeout=timeout, headers=self._headers
32 | )
33 |         except requests.exceptions.ConnectTimeout:
34 |             raise self.CODE_EXCEPTION_MSG[500](f"{server}: connection to the server timed out!")
35 |         except requests.exceptions.ConnectionError:
36 |             raise self.CODE_EXCEPTION_MSG[500](f"{server}: could not connect to the server!")
37 |
38 |         try:
39 |             response_data = response.json()
40 |         except Exception:
41 |             raise ServerError(f"{server}: failed to parse the server response!")
42 |
43 |         if not (200 <= response.status_code < 300):
44 |             exception = self.CODE_EXCEPTION_MSG[response.status_code] \
45 |                 if response.status_code in self.CODE_EXCEPTION_MSG else self.CODE_EXCEPTION_MSG[400]
46 |             raise exception(f"{server} Response: {response_data.get('error', {}).get('message')}")
47 |
48 | return response_data
49 |
50 |
51 | web_client = RequestMixin()
52 |
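web_client is a module-level singleton; non-2xx responses and connection problems surface as the custom exceptions mapped above. A hedged usage sketch (the spider address and endpoint are placeholders):

# Hypothetical call through the shared web_client; address and endpoint are placeholders.
from webs.api.exceptions.customs import InvalidAPIRequest, RecordNotFound, ServerError
from webs.core.requests import web_client

try:
    data = web_client.request(
        server='spider-node-1',
        method='GET',
        url='http://spider-node-1:5000/ping',
        timeout=10,
    )
    print(data)   # e.g. {"status": True, "data": "pong"}
except (InvalidAPIRequest, RecordNotFound, ServerError) as exc:
    print('call failed:', exc)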
--------------------------------------------------------------------------------
/services/engine/worker/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from celery import Celery
5 |
6 | ##################
7 | # Celery configuration
8 | from webs import create_app
9 |
10 |
11 | class CeleryConfig(object):
12 |     # Serialize tasks as JSON (the default serializer since Celery 4.0)
13 |     task_serializer = 'json'
14 |
15 |     # Serialize results as JSON
16 |     result_serializer = 'json'
17 |
18 |     # Result expiry time (one day)
19 |     result_expires = 60 * 60 * 24
20 |
21 |     # Accepted content types
22 |     accept_content = ["json"]
23 |
24 |     # Number of tasks prefetched per worker process
25 |     worker_prefetch_multiplier = 1
26 |
27 |     # Recycle each worker process after 200 tasks
28 |     worker_max_tasks_per_child = 200
29 |
30 |     # Timezone settings
31 | timezone = 'Asia/Shanghai'
32 | enable_utc = True
33 |
34 |
35 | ##################
36 | # Initialize the Celery worker
37 | def init_celery(app=None, celery_type='usual'):
38 | app = app or create_app()
39 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL'))
40 | celery_app.config_from_object(CeleryConfig)
41 |
42 |     # Import the relevant task modules
43 | if celery_type == 'usual':
44 | celery_app.conf.update(imports=['worker.engine', 'worker.result'])
45 | elif celery_type == 'beat':
46 | pass
47 | # celery_app.conf.update(
48 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron'])
49 | # celery_app.conf.update(
50 | # CELERYBEAT_SCHEDULE={
51 | # }
52 | # )
53 |
54 |     # Run tasks inside the Flask application context
55 | class ContextTask(celery_app.Task):
56 | """Make celery tasks work with Flask app context"""
57 |
58 | def __call__(self, *args, **kwargs):
59 | with app.app_context():
60 | return self.run(*args, **kwargs)
61 |
62 | celery_app.Task = ContextTask
63 | return celery_app
64 |
65 |
66 | celery_app = init_celery()
67 | # beat_app = init_celery(celery_type='beat')
68 |
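Tasks are registered on celery_app by name, so the engine can dispatch them without importing the task functions directly. A short sketch (the task id is illustrative; the names match the @celery_app.task(name=...) declarations in worker/engine.py below):

# Illustrative dispatch of the engine tasks defined in worker/engine.py.
from worker import celery_app

celery_app.send_task('delivery_task', args=[42])   # deliver task 42 to the spider nodes
celery_app.send_task('delete_task', args=[42])     # clean up task 42 asynchronously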
--------------------------------------------------------------------------------
/services/engine/worker/engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.exceptions.customs import RecordNotFound
4 | from webs.api.models.db_proxy import server_model_proxy, task_model_proxy, subtask_model_proxy, url_model_proxy, \
5 | schedule_task_proxy, task_url_model_proxy, result_model_proxy
6 | from worker import celery_app
7 | from worker.library.helper import split_urls, send, WeightedRandomGenerator, remove_files
8 |
9 |
10 | @celery_app.task(name='delivery_task')
11 | def delivery_task(task_id):
12 | """
13 |     Deliver (dispatch) a crawl task to the spider nodes.
14 | :param task_id:
15 | :return:
16 | """
17 |
18 | task_obj = task_model_proxy.find(id=task_id)
19 | if not task_obj:
20 | return
21 |
22 |     # Add a scheduling record for this crawl task
23 | schedule_task_obj = task_model_proxy.add_schedule_record(
24 | task_id=task_id, schedule_task_status="start_delivery", crawl_options=task_obj.crawl_options)
25 | schedule_task_id = schedule_task_obj.id
26 |
27 |     # Query the list of URL structs waiting to be crawled
28 | urls_struct = task_url_model_proxy.query_urls_by_task_id(task_id)
29 |
30 |     # Split the URLs into chunks
31 | chunk_urls_struct = split_urls(urls_struct)
32 |
33 |     # Fetch the spider node servers and their quotas (scores)
34 | servers_info = server_model_proxy.query_servers_by_score(sort='desc')
35 | if not servers_info:
36 | task_model_proxy.set_many_attr(obj_id=task_id, fields_v={'task_status': 'No server found!', 'finished': True})
37 | raise RecordNotFound('No server found!')
38 |
39 |     ###### Deliver subtasks
40 |     # If there are no more URL chunks than servers, assign them round-robin
41 | if len(chunk_urls_struct) <= len(servers_info):
42 | for index, chunk_url_struct in enumerate(chunk_urls_struct):
43 | send(schedule_task_id, chunk_url_struct, servers_info[index], task_obj.crawl_options)
44 |
45 |     # Otherwise pick servers by weighted random selection
46 | else:
47 | server_seeds = WeightedRandomGenerator(servers_info)
48 | for chunk_url_struct in chunk_urls_struct:
49 | send(schedule_task_id, chunk_url_struct, server_seeds.spawn(), task_obj.crawl_options)
50 |
51 |     ###### Set the parent task status based on how subtask delivery went
52 |     # Count the subtasks whose delivery failed
53 | failure_count = subtask_model_proxy.query_delivery_failure_count(schedule_task_id)
54 |     # If every subtask delivery failed, mark the current scheduled task as delivery_failure
55 | if failure_count == len(chunk_urls_struct):
56 | schedule_task_proxy.set_many_attr(
57 | obj_id=schedule_task_id, fields_v={'schedule_task_status': 'delivery_failure', 'finished': True}
58 | )
59 |         # For one-off (instantly) tasks, also mark the parent task as failed
60 | if task_obj.schedule_options.get('schedule_type') == 'instantly':
61 | task_model_proxy.set_many_attr(
62 | obj=task_obj, fields_v={'task_status': 'delivery_failure', 'finished': True}
63 | )
64 |
65 |     # If at least one delivery failed, mark it as a partial failure
66 | elif failure_count != 0:
67 | schedule_task_proxy.set_attr_by_id(
68 | obj_id=schedule_task_id, field='schedule_task_status', value='part_delivery_failure')
69 |
70 |     # Otherwise mark all deliveries as successful
71 | else:
72 | schedule_task_proxy.set_attr_by_id(
73 | obj_id=schedule_task_id, field='schedule_task_status', value='delivery_success')
74 |
75 |
76 | @celery_app.task(name='delete_task')
77 | def delete_task(task_id):
78 | """
79 |     Delete a task. Screenshots and HAR files must be removed as well, so deletion runs asynchronously.
80 | :param task_id:
81 | :return:
82 | """
83 |
84 |     # Query all schedule tasks of this task
85 | schedule_task_subquery = schedule_task_proxy.db_session.query(schedule_task_proxy.model.id) \
86 | .filter(schedule_task_proxy.model.task_id == task_id).subquery()
87 |
88 |     # Query all subtasks
89 | subtask_subquery = subtask_model_proxy.db_session.query(subtask_model_proxy.model.id).filter(
90 | subtask_model_proxy.model.schedule_task_id.in_(schedule_task_subquery)).subquery()
91 |
92 |     ###### Delete results
93 | result_query = result_model_proxy.self_session.filter(
94 | result_model_proxy.model.subtask_id.in_(subtask_subquery))
95 |
96 |     # Delete screenshots
97 | screenshot_ids = [each.screenshot_id + '.png' for each in
98 | result_query.filter(result_model_proxy.model.screenshot_id.isnot(None)).all()]
99 | remove_files(path='screenshots', file_ids=screenshot_ids)
100 |
101 |     # Delete HAR files
102 | har_ids = [each.har_uuid + '.json' for each in
103 | result_query.filter(result_model_proxy.model.har_uuid.isnot(None)).all()]
104 | remove_files(path='hars', file_ids=har_ids)
105 |
106 |     # Delete result rows
107 | result_query.delete(synchronize_session=False)
108 | result_model_proxy.safe_commit()
109 |
110 |     # Delete schedule task rows
111 | schedule_task_proxy.delete_models(ids=schedule_task_subquery, fields='id')
112 |
113 |     # Delete subtask rows
114 | subtask_model_proxy.delete_models(ids=subtask_subquery, fields='id')
115 |
116 |     # Delete task_url relations
117 | task_url_model_proxy.delete_model(task_id=task_id)
118 |
--------------------------------------------------------------------------------
/services/engine/worker/library/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/engine/worker/library/favicon.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import requests
5 |
6 | from urllib.parse import urljoin, urlparse
7 | from bs4 import BeautifulSoup
8 |
9 | favicon_link_rules = [
10 | 'icon',
11 | 'shortcut icon',
12 | 'apple-touch-icon',
13 | 'apple-touch-icon-precomposed',
14 | ]
15 |
16 | meta_names = ['msapplication-TileImage', 'og:image']
17 |
18 | headers = {
19 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
20 | }
21 |
22 |
23 | def get_favicon_link(url, html):
24 | """
25 |     Get the website favicon link.
26 | :param url:
27 | :param html:
28 | :return:
29 | """
30 |
31 |     # Parse the page source to see whether it declares a favicon link
32 | soup = BeautifulSoup(html, features='html.parser')
33 |
34 |     # Look for <link> tags
35 | for rule in favicon_link_rules:
36 | favicon_tag = soup.find('link', attrs={'rel': lambda r: r and r.lower() == rule, 'href': True})
37 | if favicon_tag:
38 | favicon_href = favicon_tag.get('href', '').strip()
39 | return fmt_link(url, favicon_href)
40 |
41 |     # Look for <meta> tags
42 | for meta_tag in soup.find_all('meta', attrs={'content': True}):
43 |         meta_type = (meta_tag.get('name') or meta_tag.get('property') or '').lower()
44 | for name in meta_names:
45 | if meta_type == name.lower():
46 |                 favicon_href = meta_tag.get('content', '').strip()  # these meta tags carry the URL in their content attribute
47 | return fmt_link(url, favicon_href)
48 |
49 |     # Fall back to requesting /favicon.ico at the site root
50 | root_icon_link = get_root_dir_icon(url)
51 | if root_icon_link:
52 | return root_icon_link, 'ico'
53 |
54 | return None, None
55 |
56 |
57 | def fmt_link(website_url, href):
58 | """
59 |     Normalize the favicon href into an absolute URL and validate it.
60 | :param website_url:
61 | :param href:
62 | :return: favicon_link, ext
63 | """
64 |
65 | if not href or href.startswith('data:image/'):
66 | return None, None
67 |
68 | if not urlparse(href).netloc:
69 | href = urljoin(website_url, href)
70 |
71 | if urlparse(href).netloc:
72 | url_parsed = href
73 | else:
74 | url_parsed = urljoin(website_url, href)
75 |
76 | url_parsed = urlparse(url_parsed, scheme=urlparse(website_url).scheme)
77 | _, ext = os.path.splitext(url_parsed.path)
78 | favicon_url = url_parsed.geturl()
79 | try:
80 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers)
81 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'):
82 | return favicon_url, ext[1:].lower()
83 | except Exception as e:
84 | return None, None
85 | return None, None
86 |
87 |
88 | def get_root_dir_icon(url):
89 | try:
90 | parsed = urlparse(url)
91 | favicon_url = parsed.scheme + "://" + parsed.netloc + '/favicon.ico'
92 | response = requests.get(favicon_url, timeout=30, allow_redirects=True, verify=False, headers=headers)
93 | if response.status_code == 200 and response.headers['Content-Type'].startswith('image'):
94 | return response.url
95 | except Exception as e:
96 | return
97 |
98 | return
99 |
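A small usage sketch of get_favicon_link; the HTML below is illustrative. Note that fmt_link validates the candidate link with a real HTTP request, so offline (or for an unreachable URL) the call falls back to the /favicon.ico probe and may return (None, None):

# Illustrative call; network access determines the actual return value.
from worker.library.favicon import get_favicon_link

html = (
    '<html>'
    '<head><link rel="icon" href="https://example.com/static/favicon.png"></head>'
    '<body></body>'
    '</html>'
)
link, ext = get_favicon_link('https://example.com', html)
print(link, ext)   # the declared icon URL and extension, or (None, None) if it cannot be fetched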
--------------------------------------------------------------------------------
/services/engine/worker/library/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import bisect
4 | import hashlib
5 | import math
6 | import os
7 | import random
8 | import uuid
9 | from datetime import datetime
10 |
11 | import requests
12 | from html2text import HTML2Text
13 |
14 | from webs.api.models.db_proxy import subtask_model_proxy
15 | from webs.core.requests import web_client
16 | from worker.library.favicon import get_favicon_link
17 |
18 |
19 | class WeightedRandomGenerator(object):
20 | def __init__(self, weights):
21 | print(weights)
22 | self.weights = weights
23 | self.totals = []
24 | running_total = 0
25 |
26 | for w in weights:
27 | running_total += w['score']
28 | self.totals.append(running_total)
29 |
30 | def spawn(self):
31 | rnd = random.random() * self.totals[-1]
32 | index = bisect.bisect_right(self.totals, rnd)
33 | return self.weights[index]
34 |
35 | def __call__(self):
36 | return self.spawn()
37 |
38 |
39 | def split_urls(urls):
40 | """对url列表进行拆分"""
41 | if len(urls) > 100:
42 | m = len(urls) // 100
43 | n = int(math.ceil(len(urls) / float(m)))
44 | chunk_list = [urls[i:i + n] for i in range(0, len(urls), n)]
45 | else:
46 | chunk_list = [urls]
47 |
48 | return chunk_list
49 |
50 |
51 | def send(schedule_task_id, url_nested_list, server_info, options):
52 |     # Create the subtask model
53 | subtask_obj = subtask_model_proxy.create(schedule_task_id, server_id=server_info['server_id'])
54 |
55 |     # Send the request to the spider node
56 | try:
57 | response = web_client.request(
58 | server=server_info['server_name'],
59 | url=server_info['server_address'] + '/crawl_tasks',
60 | method='POST', timeout=60,
61 | json={
62 | 'subtask_id': subtask_obj.id,
63 | 'url_nested_list': url_nested_list,
64 | 'options': options
65 | }
66 | )
67 | failure_msg = '' if response['status'] is True else response['error']['message']
68 | except Exception as e:
69 |         failure_msg = getattr(e, 'message', str(e))
70 | if failure_msg:
71 |         # Record the subtask delivery failure reason
72 | subtask_model_proxy.set_many_attr(obj=subtask_obj, fields_v={
73 | 'finished': True,
74 | 'finished_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
75 | 'delivery_failure_msg': failure_msg
76 | })
77 |
78 |
79 | def extract_text(content):
80 | """
81 |     Extract the readable body text from an HTML page.
82 | :param content:
83 | :return:
84 | """
85 |
86 | h = HTML2Text(bodywidth=0)
87 | h.ignore_links = True
88 | h.ignore_images = True
89 | h.ignore_tables = True
90 | h.ignore_emphasis = True
91 | try:
92 | result = h.handle(content).replace('*', '').replace('\n\n', '\n')
93 | except Exception as e:
94 | result = None
95 | return '' if result == '\n' else result
96 |
97 |
98 | def save_favicon(url, html):
99 | """
100 |     Download and save the website favicon; returns (favicon_md5, favicon_link).
101 | :param url:
102 | :param html:
103 | :return:
104 | """
105 | favicon_link, icon_ext = get_favicon_link(url, html)
106 | if favicon_link:
107 | try:
108 | response = requests.get(favicon_link, stream=True, timeout=10)
109 | except Exception as e:
110 | return None, None
111 | temp_filename = str(uuid.uuid4())
112 | save_path = '/usr/src/app/screenshots/{}.{}'.format(temp_filename, icon_ext)
113 | with open(save_path, 'wb+') as image:
114 | for chunk in response.iter_content(1024):
115 | image.write(chunk)
116 | image.seek(0)
117 | favicon_md5 = hashlib.md5(image.read()).hexdigest()
118 | os.rename(save_path, '/usr/src/app/screenshots/{}.{}'.format(favicon_md5, icon_ext))
119 | return favicon_md5, favicon_link
120 | return None, None
121 |
122 |
123 | def remove_files(path, file_ids):
124 | """
125 |     Remove files under the given path by their file ids.
126 | :return:
127 | """
128 |
129 | for file_id in file_ids:
130 | try:
131 | os.remove(f'/usr/src/app/{path}/{file_id}')
132 | except FileNotFoundError as e:
133 | pass
134 |
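A worked sketch of the two dispatch helpers used by delivery_task below: split_urls chunks the URL list, and WeightedRandomGenerator picks servers in proportion to their score. The URL structs and server dicts here are made up; the real ones come from the db proxies:

# Illustrative only; server dicts mimic the keys used by send() and WeightedRandomGenerator.
from worker.library.helper import WeightedRandomGenerator, split_urls

urls = [{'url_id': i, 'url_address': f'https://example.com/{i}'} for i in range(250)]
chunks = split_urls(urls)
print([len(c) for c in chunks])   # 250 URLs -> two chunks of 125

servers = [
    {'server_id': 1, 'server_name': 'node-1', 'server_address': 'http://node-1:5000', 'score': 3},
    {'server_id': 2, 'server_name': 'node-2', 'server_address': 'http://node-2:5000', 'score': 1},
]
picker = WeightedRandomGenerator(servers)
picked = [picker.spawn()['server_name'] for _ in range(4)]
print(picked)   # node-1 is chosen roughly three times as often as node-2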
--------------------------------------------------------------------------------
/services/engine/worker/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import re
5 |
6 | from rpc.client.callback_client import CallbackClient
7 | from wappalyzer import wappalyzer_handler
8 | from webs.api.models.db_proxy import result_model_proxy, task_model_proxy
9 | from webs.core.requests import web_client
10 | from worker import celery_app
11 | from worker.library.helper import extract_text, save_favicon
12 |
13 |
14 | def callback_http(callback_address, task_obj, result, finished):
15 | """
16 |     Callback via HTTP.
17 | :return:
18 | """
19 | try:
20 | response = web_client.request(
21 | server='callback', method='POST',
22 | url=callback_address,
23 | timeout=60, json={
24 | 'customer_id': task_obj.customer_id,
25 | 'extra_data': task_obj.extra_data,
26 | 'task_id': task_obj.id,
27 | 'finished': finished,
28 | 'result': result
29 | }
30 | )
31 | failure_msg = '' if response['status'] is True else response['error']['message']
32 | except Exception as e:
33 |         failure_msg = getattr(e, 'message', str(e))
34 | if failure_msg and result.get('result_id'):
35 | result_model_proxy.set_attr_by_id(result['result_id'], 'callback_failure_msg', failure_msg)
36 |
37 |
38 | def callback_grpc(callback_address, task_obj, result):
39 | """
40 | 回调方式为rpc
41 | :return:
42 | """
43 |
44 | callback_client = CallbackClient(rpc_server=callback_address)
45 | callback_client.callback_save_result(task_obj, result)
46 |
47 |
48 | @celery_app.task(name='save_base_result')
49 | def save_base_result_by_grpc(**kwargs):
50 | """
51 |     Asynchronously collect and store the crawl result data.
52 | :param kwargs:
53 | :return:
54 | """
55 |
56 | task_obj = task_model_proxy.query_task_obj_by_subtask(subtask_id=kwargs['subtask_id'])
57 | if not task_obj:
58 | return
59 |
60 |     # Parse the website character encoding
61 | try:
62 |         m = re.compile('
--------------------------------------------------------------------------------
/services/spider/Dockerfile-dev:
--------------------------------------------------------------------------------
15 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
16 |
17 | # set working directory
18 | RUN mkdir -p /usr/src/app
19 | WORKDIR /usr/src/app
20 |
21 | # add and install requirements
22 | COPY ./requirements.txt /usr/src/app/requirements.txt
23 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \
24 | pip install -r requirements.txt -i https://pypi.douban.com/simple
25 |
26 | # add app
27 | COPY . /usr/src/app
28 |
29 | # run server
30 | CMD ["/usr/src/app/entrypoint.sh"]
31 |
--------------------------------------------------------------------------------
/services/spider/Dockerfile-prod:
--------------------------------------------------------------------------------
1 | FROM harbor.socmap.net/crawloop/playwright-xvfb:v1.0.0
2 |
3 | WORKDIR /usr/src/app
4 |
5 | COPY ./requirements.txt /usr/src/app
6 |
7 | RUN apt update && \
8 | apt-get -y install netcat && \
9 | rm -rf /var/lib/apt/lists/*
10 |
11 |
12 | # set timezone
13 | ENV TZ=Asia/Shanghai
14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
15 |
16 | RUN pip install --upgrade pip -i https://pypi.douban.com/simple && \
17 | pip install -r requirements.txt -i https://pypi.douban.com/simple
18 |
19 | COPY . /usr/src/app
20 |
21 | RUN sh build.sh
22 |
23 | CMD ["/usr/src/app/entrypoint.sh"]
24 |
25 |
26 |
--------------------------------------------------------------------------------
/services/spider/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean __pycache__ directories
4 | find . -type d -name __pycache__ | xargs rm -rf
5 |
6 | # Compile the code with Cython
7 | python3 compile.py build_ext --inplace
8 | if [ $? -ne 0 ]; then
9 | exit 1
10 | fi
11 |
12 | # Rename the generated .so files (strip the cpython suffix)
13 | find ./rpc -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
14 | find ./webs -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
15 | find ./worker -name '*.so' | awk -F '.cpython-37m-x86_64-linux-gnu' '{print "mv "$0" "$1$2}' | sh
16 |
17 | # Remove the .py source files
18 | find ./rpc -name '*.py' | xargs rm -f
19 | find ./webs -name '*.py' | xargs rm -f
20 | find ./worker -name '*.py' | xargs rm -f
21 |
22 | # Remove files that are no longer needed
23 | rm -rf build
24 | rm -f .gitignore
25 | rm -f compile.py
26 | rm -f build.sh
27 |
--------------------------------------------------------------------------------
/services/spider/build_sentry_ini.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import os
5 |
6 | from configobj import ConfigObj
7 |
8 | log_ini = ConfigObj("gunicorn_logging.ini", encoding='UTF8')
9 | log_ini['handler_sentry']['args'] = json.dumps((os.getenv('SENTRY_DSN'),), ensure_ascii=False)
10 | log_ini.write()
11 |
--------------------------------------------------------------------------------
/services/spider/compile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Cython.Build import cythonize
4 | from Cython.Distutils import build_ext
5 | from setuptools import setup
6 | from setuptools.extension import Extension
7 |
8 | setup(
9 | ext_modules=cythonize(
10 | [
11 | Extension('rpc.*', ['rpc/*.py']),
12 | Extension('rpc.pb.*', ['rpc/pb/*.py']),
13 | Extension('rpc.client.*', ['rpc/client/*.py']),
14 | Extension('webs.*', ['webs/*.py']),
15 | Extension('webs.api.*', ['webs/api/*.py']),
16 | Extension('webs.api.bizs.*', ['webs/api/bizs/*.py']),
17 | Extension('webs.api.exceptions.*', ['webs/api/exceptions/*.py']),
18 | Extension('webs.api.models*', ['webs/api/models/*.py']),
19 | Extension('webs.api.models.db_proxy.*', ['webs/api/models/db_proxy/*.py']),
20 | Extension('webs.api.schemas.*', ['webs/api/schemas/*.py']),
21 | Extension('webs.api.utils.*', ['webs/api/utils/*.py']),
22 | Extension('webs.api.views.*', ['webs/api/views/*.py']),
23 | Extension('worker.*', ['worker/*.py']),
24 | Extension('worker.library.*', ['worker/library/*.py']),
25 | ],
26 | build_dir='build',
27 | compiler_directives=dict(
28 | always_allow_keywords=True, language_level=3
29 | )
30 | ),
31 | cmdclass=dict(
32 | build_ext=build_ext
33 | )
34 | )
35 |
--------------------------------------------------------------------------------
/services/spider/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Web service
4 | if [ "$ENDPOINT" = "web" ]; then
5 |     # Development environment
6 |     if [ "$FLASK_ENV" = "development" ]; then
7 |         flask run -h 0.0.0.0 -p 5000
8 |
9 |     # Production environment
10 | elif [ "$FLASK_ENV" = "production" ]; then
11 | python build_sentry_ini.py
12 | gunicorn --worker-tmp-dir /dev/shm --log-config gunicorn_logging.ini -c gunicorn_config.py manage:app
13 | fi
14 |
15 | # Fetch (crawl) worker
16 | elif [ "$ENDPOINT" = "fetch" ]; then
17 |     # Start a virtual display
18 |     echo "starting Xvfb"
19 | rm -rf /tmp/.X99-lock
20 | Xvfb -screen 0 1020x720x16 :99 &
21 | export DISPLAY=:99
22 | celery -A worker.celery_app worker -Q priority_fetch -l info -c $WORK_MAX_COUNT --prefetch-multiplier 1 --max-tasks-per-child 1 -n crawl_fetch@%h
23 |
24 | # Result-saving worker
25 | elif [ "$ENDPOINT" = "results" ]; then
26 | # celery -A worker.celery_app worker -Q results -l info -c 5 --prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h
27 | celery -A worker.celery_app worker -Q results -l info --pool=prefork --concurrency=5 --without-heartbeat --prefetch-multiplier 4 --max-tasks-per-child 100 -n results@%h
28 | fi
29 |
--------------------------------------------------------------------------------
/services/spider/gunicorn_config.py:
--------------------------------------------------------------------------------
1 | # Sample Gunicorn configuration file.
2 |
3 | import multiprocessing as mlp
4 |
5 | # Apply gevent monkey-patching up front to avoid recursion issues
6 | import gevent.monkey
7 |
8 | gevent.monkey.patch_all()
9 |
10 | #
11 | # Server socket
12 | #
13 | # bind - The socket to bind.
14 | #
15 | # A string of the form: 'HOST', 'HOST:PORT', 'unix:PATH'.
16 | # An IP is a valid HOST.
17 | #
18 | # backlog - The number of pending connections. This refers
19 | # to the number of clients that can be waiting to be
20 | # served. Exceeding this number results in the client
21 | # getting an error when attempting to connect. It should
22 | # only affect servers under significant load.
23 | #
24 | # Must be a positive integer. Generally set in the 64-2048
25 | # range.
26 | #
27 |
28 | bind = '0.0.0.0:5000'
29 | backlog = 2048
30 |
31 | #
32 | # Worker processes
33 | #
34 | # workers - The number of worker processes that this server
35 | # should keep alive for handling requests.
36 | #
37 | # A positive integer generally in the 2-4 x $(NUM_CORES)
38 | # range. You'll want to vary this a bit to find the best
39 | # for your particular application's work load.
40 | #
41 | # worker_class - The type of workers to use. The default
42 | # sync class should handle most 'normal' types of work
43 | # loads. You'll want to read
44 | # http://docs.gunicorn.org/en/latest/design.html#choosing-a-worker-type
45 | # for information on when you might want to choose one
46 | # of the other worker classes.
47 | #
48 | # A string referring to a Python path to a subclass of
49 | # gunicorn.workers.base.Worker. The default provided values
50 | # can be seen at
51 | # http://docs.gunicorn.org/en/latest/settings.html#worker-class
52 | #
53 | # worker_connections - For the eventlet and gevent worker classes
54 | # this limits the maximum number of simultaneous clients that
55 | # a single process can handle.
56 | #
57 | # A positive integer generally set to around 1000.
58 | #
59 | # timeout - If a worker does not notify the master process in this
60 | # number of seconds it is killed and a new worker is spawned
61 | # to replace it.
62 | #
63 | # Generally set to thirty seconds. Only set this noticeably
64 | # higher if you're sure of the repercussions for sync workers.
65 | # For the non sync workers it just means that the worker
66 | # process is still communicating and is not tied to the length
67 | # of time required to handle a single request.
68 | #
69 | # keepalive - The number of seconds to wait for the next request
70 | # on a Keep-Alive HTTP connection.
71 | #
72 | # A positive integer. Generally set in the 1-5 seconds range.
73 | #
74 |
75 | # Number of processes
76 | workers = mlp.cpu_count() * 2 + 1
77 |
78 | # Threads
79 | threads = mlp.cpu_count() * 2
80 |
81 | worker_class = 'gevent'
82 | worker_connections = 1000
83 | timeout = 30
84 | keepalive = 2
85 |
86 | #
87 | # spew - Install a trace function that spews every line of Python
88 | # that is executed when running the server. This is the
89 | # nuclear option.
90 | #
91 | # True or False
92 | #
93 |
94 | spew = False
95 |
96 | #
97 | # Server mechanics
98 | #
99 | # daemon - Detach the main Gunicorn process from the controlling
100 | # terminal with a standard fork/fork sequence.
101 | #
102 | # True or False
103 | #
104 | # raw_env - Pass environment variables to the execution environment.
105 | #
106 | # pidfile - The path to a pid file to write
107 | #
108 | # A path string or None to not write a pid file.
109 | #
110 | # user - Switch worker processes to run as this user.
111 | #
112 | # A valid user id (as an integer) or the name of a user that
113 | # can be retrieved with a call to pwd.getpwnam(value) or None
114 | # to not change the worker process user.
115 | #
116 | # group - Switch worker process to run as this group.
117 | #
118 | # A valid group id (as an integer) or the name of a user that
119 | # can be retrieved with a call to pwd.getgrnam(value) or None
120 | # to change the worker processes group.
121 | #
122 | # umask - A mask for file permissions written by Gunicorn. Note that
123 | # this affects unix socket permissions.
124 | #
125 | # A valid value for the os.umask(mode) call or a string
126 | # compatible with int(value, 0) (0 means Python guesses
127 | # the base, so values like "0", "0xFF", "0022" are valid
128 | # for decimal, hex, and octal representations)
129 | #
130 | # tmp_upload_dir - A directory to store temporary request data when
131 | # requests are read. This will most likely be disappearing soon.
132 | #
133 | # A path to a directory where the process owner can write. Or
134 | # None to signal that Python should choose one on its own.
135 | #
136 |
137 | #
138 | # Logging
139 | #
140 | # logfile - The path to a log file to write to.
141 | #
142 | # A path string. "-" means log to stdout.
143 | #
144 | # loglevel - The granularity of log output
145 | #
146 | # A string of "debug", "info", "warning", "error", "critical"
147 | #
148 |
149 | errorlog = '-'
150 | loglevel = 'error'
151 | accesslog = '-'
152 | access_log_format = '{"request_address": "%(h)s", ' \
153 | '"request_time": "%(t)s", ' \
154 | '"request": "%(r)s", ' \
155 | '"http_status_code": "%(s)s", ' \
156 | '"http_request_url": "%(U)s", ' \
157 | '"http_query_string": "%(q)s", ' \
158 | '"request_headers": {' \
159 | '"content-type": "%({content-type}i)s", ' \
160 | '"content-length": "%({content-length}i)s", ' \
161 | '"user-agent": "%(a)s"' \
162 | '}}'
163 |
--------------------------------------------------------------------------------
/services/spider/gunicorn_logging.ini:
--------------------------------------------------------------------------------
1 | # Logging configuration
2 |
3 | [loggers]
4 | keys = root, gunicorn.access, gunicorn.error
5 |
6 | [handlers]
7 | keys = access, error, sentry
8 |
9 | [formatters]
10 | keys = json, generic
11 |
12 | # Root logger
13 | # The root logger sends messages to the console and to Sentry.
14 | [logger_root]
15 | handlers = error, sentry
16 |
17 | # Gunicorn loggers
18 | # Gunicorn logging is configured with two loggers: 'gunicorn.access' and 'gunicorn.error'.
19 | # The access log is sent to stdout and the error log is sent to stderr, both without propagation.
20 | # Only the error logger additionally sends messages to Sentry.
21 |
22 | [logger_gunicorn.access]
23 | level = INFO
24 | handlers = access
25 | propagate = 0
26 | qualname = gunicorn.access
27 |
28 | [logger_gunicorn.error]
29 | level = ERROR
30 | handlers = error, sentry
31 | propagate = 0
32 | qualname = gunicorn.error
33 |
34 | # Handlers
35 | [handler_access]
36 | class = StreamHandler
37 | formatter = json
38 | args = (sys.stdout, )
39 |
40 | [handler_error]
41 | class = StreamHandler
42 | formatter = json
43 | args = (sys.stderr,)
44 |
45 | [handler_sentry]
46 | class = raven.handlers.logging.SentryHandler
47 | level = ERROR
48 | formatter = generic
49 | sentry_dsn = example
50 | args = [%(sentry_dsn)s]
51 |
52 | [formatter_generic]
53 | format = [sccp][%(levelname)s] [%(name)s]: %(message)s
54 | [formatter_json]
55 | class = webs.api.utils.loggers.JSONFormatter
--------------------------------------------------------------------------------
/services/spider/manage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask.cli import FlaskGroup
4 | from webs import create_app
5 |
6 | app = create_app()
7 | cli = FlaskGroup(create_app=create_app)
8 |
9 | if __name__ == '__main__':
10 | cli()
11 |
--------------------------------------------------------------------------------
/services/spider/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
--------------------------------------------------------------------------------
/services/spider/migrations/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # template used to generate migration files
5 | # file_template = %%(rev)s_%%(slug)s
6 |
7 | # set to 'true' to run the environment during
8 | # the 'revision' command, regardless of autogenerate
9 | # revision_environment = false
10 |
11 |
12 | # Logging configuration
13 | [loggers]
14 | keys = root,sqlalchemy,alembic
15 |
16 | [handlers]
17 | keys = console
18 |
19 | [formatters]
20 | keys = generic
21 |
22 | [logger_root]
23 | level = WARN
24 | handlers = console
25 | qualname =
26 |
27 | [logger_sqlalchemy]
28 | level = WARN
29 | handlers =
30 | qualname = sqlalchemy.engine
31 |
32 | [logger_alembic]
33 | level = INFO
34 | handlers =
35 | qualname = alembic
36 |
37 | [handler_console]
38 | class = StreamHandler
39 | args = (sys.stderr,)
40 | level = NOTSET
41 | formatter = generic
42 |
43 | [formatter_generic]
44 | format = %(levelname)-5.5s [%(name)s] %(message)s
45 | datefmt = %H:%M:%S
46 |
--------------------------------------------------------------------------------
/services/spider/migrations/env.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 |
3 | import logging
4 | from logging.config import fileConfig
5 |
6 | from sqlalchemy import engine_from_config
7 | from sqlalchemy import pool
8 |
9 | from alembic import context
10 |
11 | # this is the Alembic Config object, which provides
12 | # access to the values within the .ini file in use.
13 | config = context.config
14 |
15 | # Interpret the config file for Python logging.
16 | # This line sets up loggers basically.
17 | fileConfig(config.config_file_name)
18 | logger = logging.getLogger('alembic.env')
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | from flask import current_app
25 | config.set_main_option('sqlalchemy.url',
26 | current_app.config.get('SQLALCHEMY_DATABASE_URI'))
27 | target_metadata = current_app.extensions['migrate'].db.metadata
28 |
29 | # other values from the config, defined by the needs of env.py,
30 | # can be acquired:
31 | # my_important_option = config.get_main_option("my_important_option")
32 | # ... etc.
33 |
34 |
35 | def run_migrations_offline():
36 | """Run migrations in 'offline' mode.
37 |
38 | This configures the context with just a URL
39 | and not an Engine, though an Engine is acceptable
40 | here as well. By skipping the Engine creation
41 | we don't even need a DBAPI to be available.
42 |
43 | Calls to context.execute() here emit the given string to the
44 | script output.
45 |
46 | """
47 | url = config.get_main_option("sqlalchemy.url")
48 | context.configure(
49 | url=url, target_metadata=target_metadata, literal_binds=True
50 | )
51 |
52 | with context.begin_transaction():
53 | context.run_migrations()
54 |
55 |
56 | def run_migrations_online():
57 | """Run migrations in 'online' mode.
58 |
59 | In this scenario we need to create an Engine
60 | and associate a connection with the context.
61 |
62 | """
63 |
64 | # this callback is used to prevent an auto-migration from being generated
65 | # when there are no changes to the schema
66 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
67 | def process_revision_directives(context, revision, directives):
68 | if getattr(config.cmd_opts, 'autogenerate', False):
69 | script = directives[0]
70 | if script.upgrade_ops.is_empty():
71 | directives[:] = []
72 | logger.info('No changes in schema detected.')
73 |
74 | connectable = engine_from_config(
75 | config.get_section(config.config_ini_section),
76 | prefix='sqlalchemy.',
77 | poolclass=pool.NullPool,
78 | )
79 |
80 | with connectable.connect() as connection:
81 | context.configure(
82 | connection=connection,
83 | target_metadata=target_metadata,
84 | process_revision_directives=process_revision_directives,
85 | **current_app.extensions['migrate'].configure_args
86 | )
87 |
88 | with context.begin_transaction():
89 | context.run_migrations()
90 |
91 |
92 | if context.is_offline_mode():
93 | run_migrations_offline()
94 | else:
95 | run_migrations_online()
96 |
--------------------------------------------------------------------------------
/services/spider/migrations/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 |
18 |
19 | def upgrade():
20 | ${upgrades if upgrades else "pass"}
21 |
22 |
23 | def downgrade():
24 | ${downgrades if downgrades else "pass"}
25 |
--------------------------------------------------------------------------------
/services/spider/migrations/versions/81a88acb3641_记录cookies.py:
--------------------------------------------------------------------------------
1 | """记录cookies
2 |
3 | Revision ID: 81a88acb3641
4 | Revises: 8efa2b9dcc87
5 | Create Date: 2020-12-22 15:37:26.700404
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '81a88acb3641'
14 | down_revision = '8efa2b9dcc87'
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.add_column('results', sa.Column('cookies', postgresql.JSONB(astext_type=sa.Text()), nullable=True))
22 | # ### end Alembic commands ###
23 |
24 |
25 | def downgrade():
26 | # ### commands auto generated by Alembic - please adjust! ###
27 | op.drop_column('results', 'cookies')
28 | # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/services/spider/migrations/versions/8efa2b9dcc87_init.py:
--------------------------------------------------------------------------------
1 | """init
2 |
3 | Revision ID: 8efa2b9dcc87
4 | Revises:
5 | Create Date: 2020-12-08 10:22:43.545415
6 |
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | from sqlalchemy.dialects import postgresql
11 |
12 | # revision identifiers, used by Alembic.
13 | revision = '8efa2b9dcc87'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 |
18 |
19 | def upgrade():
20 | # ### commands auto generated by Alembic - please adjust! ###
21 | op.create_table('crawl_tasks',
22 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False),
23 | sa.Column('subtask_id', sa.Integer(), nullable=False),
24 | sa.Column('url_nested_list', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
25 | sa.Column('process_state', sa.String(length=30), server_default='readying', nullable=True),
26 | sa.Column('failure_url_ids', sa.ARRAY(sa.Integer()), server_default='{}', nullable=True),
27 | sa.Column('finished_at', sa.TIMESTAMP(), nullable=True),
28 | sa.Column('options', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
29 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
30 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
31 | sa.PrimaryKeyConstraint('id')
32 | )
33 | op.create_index(op.f('ix_crawl_tasks_create_time'), 'crawl_tasks', ['create_time'], unique=False)
34 | op.create_index(op.f('ix_crawl_tasks_subtask_id'), 'crawl_tasks', ['subtask_id'], unique=False)
35 | op.create_index(op.f('ix_crawl_tasks_update_time'), 'crawl_tasks', ['update_time'], unique=False)
36 | op.create_table('results',
37 | sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False),
38 | sa.Column('subtask_id', sa.Integer(), nullable=False),
39 | sa.Column('url_id', sa.Integer(), nullable=False),
40 | sa.Column('url_address', sa.String(length=1024), nullable=False),
41 | sa.Column('http_code', sa.Integer(), nullable=True),
42 | sa.Column('title', sa.Text(), nullable=True),
43 | sa.Column('content', sa.Text(), nullable=True),
44 | sa.Column('current_url', sa.String(length=1024), nullable=True),
45 | sa.Column('redirect_chain', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
46 | sa.Column('response_headers', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
47 | sa.Column('har_uuid', sa.String(length=128), nullable=True),
48 | sa.Column('screenshot_id', sa.String(length=128), nullable=True),
49 | sa.Column('create_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
50 | sa.Column('update_time', sa.TIMESTAMP(), server_default=sa.text('now()'), nullable=True),
51 | sa.PrimaryKeyConstraint('id')
52 | )
53 | op.create_index(op.f('ix_results_create_time'), 'results', ['create_time'], unique=False)
54 | op.create_index(op.f('ix_results_subtask_id'), 'results', ['subtask_id'], unique=False)
55 | op.create_index(op.f('ix_results_update_time'), 'results', ['update_time'], unique=False)
56 | op.create_index(op.f('ix_results_url_id'), 'results', ['url_id'], unique=False)
57 | # ### end Alembic commands ###
58 |
59 |
60 | def downgrade():
61 | # ### commands auto generated by Alembic - please adjust! ###
62 | op.drop_index(op.f('ix_results_url_id'), table_name='results')
63 | op.drop_index(op.f('ix_results_update_time'), table_name='results')
64 | op.drop_index(op.f('ix_results_subtask_id'), table_name='results')
65 | op.drop_index(op.f('ix_results_create_time'), table_name='results')
66 | op.drop_table('results')
67 | op.drop_index(op.f('ix_crawl_tasks_update_time'), table_name='crawl_tasks')
68 | op.drop_index(op.f('ix_crawl_tasks_subtask_id'), table_name='crawl_tasks')
69 | op.drop_index(op.f('ix_crawl_tasks_create_time'), table_name='crawl_tasks')
70 | op.drop_table('crawl_tasks')
71 | # ### end Alembic commands ###
72 |
--------------------------------------------------------------------------------
/services/spider/requirements.txt:
--------------------------------------------------------------------------------
1 | alembic==1.4.3
2 | amqp==2.6.1
3 | appdirs==1.4.4
4 | billiard==3.6.3.0
5 | celery==4.3.0
6 | certifi==2020.11.8
7 | chardet==3.0.4
8 | click==7.1.2
9 | configobj==5.0.6
10 | Cython==0.29.21
11 | Flask==1.1.2
12 | Flask-Migrate==2.4.0
13 | Flask-Redis==0.3.0
14 | Flask-SQLAlchemy==2.3.2
15 | gevent==1.4.0
16 | greenlet==0.4.15
17 | grpcio==1.33.2
18 | grpcio-tools==1.33.2
19 | gunicorn==19.9.0
20 | idna==2.8
21 | importlib-metadata==2.0.0
22 | itsdangerous==1.1.0
23 | Jinja2==2.11.2
24 | kombu==4.6.11
25 | Mako==1.1.3
26 | MarkupSafe==1.1.1
27 | marshmallow==2.19.2
28 | protobuf==3.14.0
29 | psutil==5.7.3
30 | psycopg2-binary==2.7.6.1
31 | pyee==7.0.4
32 | pyppeteer==0.2.2
33 | python-dateutil==2.8.1
34 | python-editor==1.0.4
35 | pytz==2020.4
36 | raven==6.10.0
37 | redis==3.5.3
38 | requests==2.22.0
39 | six==1.15.0
40 | SQLAlchemy==1.3.20
41 | tqdm==4.52.0
42 | urllib3==1.25.11
43 | vine==1.3.0
44 | webargs==4.0.0
45 | websockets==8.1
46 | Werkzeug==1.0.1
47 | zipp==3.4.0
48 | zope.event==4.5.0
49 | zope.interface==5.2.0
50 |
--------------------------------------------------------------------------------
/services/spider/rpc/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/rpc/client/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/rpc/client/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import os
4 |
5 | import grpc
6 |
7 | from rpc.pb import result_pb2
8 | from rpc.pb.result_pb2_grpc import ResultStub
9 |
10 | CHUNK_SIZE = 10 * 1024
11 |
12 |
13 | def get_file_chunks(filename, folder_path):
14 | yield result_pb2.StreamUploadPictureRequest(filename=filename)
15 | with open(f'/usr/src/app/{folder_path}/' + filename, 'rb') as f:
16 | while True:
17 | piece = f.read(CHUNK_SIZE)
18 | if len(piece) == 0:
19 | return
20 | yield result_pb2.StreamUploadPictureRequest(file_data={"buffer": piece})
21 |
22 |
23 | def remove_file(file_path):
24 | """
25 | 删除文件
26 | :param file_path:
27 | :return:
28 | """
29 |
30 | try:
31 | os.remove(file_path)
32 | except (NotImplementedError, FileNotFoundError):
33 | pass
34 |
35 |
36 | class ResultClient(object):
37 |
38 | def __init__(self, rpc_server):
39 | # Channel to the RPC server
40 | channel = grpc.insecure_channel(target=f'{rpc_server}', options=[
41 | ('grpc.max_send_message_length', int(os.getenv('GRPC_MAX_SEND_MESSAGE_LENGTH', 200)) * 1024 * 1024),
42 | ('grpc.max_receive_message_length', int(os.getenv('GRPC_MAX_RECEIVE_MESSAGE_LENGTH', 200)) * 1024 * 1024),
43 | ])
44 |
45 | # Get the Result gRPC service stub
46 | self.stub = ResultStub(channel)
47 |
48 | def save_base_result(self, subtask_id, url_id, url_address, finished_at, **kwargs):
49 | """保存爬虫基本信息"""
50 |
51 | # 返回头部序列化
52 | kwargs['response_headers'] = self.dic2json(kwargs.pop('response_headers', {}))
53 |
54 | # 生成状态码
55 | kwargs['http_code'] = kwargs['redirect_chain'][-1]['redirect_http_code'] if kwargs['redirect_chain'] else None
56 |
57 | # 去除firefox和chrome默认content
58 | if kwargs['content'] and (kwargs['content'].startswith(
59 | '')
60 | or kwargs['content'] == ''):
61 | kwargs['content'] = None
62 |
63 | # # http交互过程序列化
64 | # kwargs['http_archive'] = self.dic2json(kwargs.pop('http_archive', []))
65 | self.stub.SaveBaseResult(
66 | result_pb2.SaveBaseResultRequest(
67 | subtask_id=subtask_id, url_id=url_id, url_address=url_address,
68 | finished_at=finished_at, **kwargs),
69 | timeout=30
70 | )
71 |
72 | def upload_screenshot(self, screenshot_name):
73 | """上传截图"""
74 | chunks_generator = get_file_chunks(screenshot_name, folder_path='screenshots')
75 | response = self.stub.StreamUploadPicture(chunks_generator)
76 | file_path = f'/usr/src/app/screenshots/{screenshot_name}'
77 | assert response.length == os.path.getsize(file_path)
78 | remove_file(file_path)
79 |
80 | def set_subtask_status(self, subtask_id, status, finished_at):
81 | """标记子任务爬取状态"""
82 | self.stub.SetSubTaskStatus(
83 | result_pb2.SetSubTaskStatusRequest(
84 | subtask_id=subtask_id,
85 | status=status,
86 | finished_at=finished_at
87 | ),
88 | timeout=30
89 | )
90 |
91 | def upload_har_file(self, har_file_name):
92 | """上传har文件"""
93 | chunks_generator = get_file_chunks(har_file_name, folder_path='hars')
94 | response = self.stub.StreamUploadHarFile(chunks_generator)
95 | file_path = f'/usr/src/app/hars/{har_file_name}'
96 | assert response.length == os.path.getsize(file_path)
97 | remove_file(file_path)
98 |
99 | @staticmethod
100 | def dic2json(dic):
101 | """某些字段转换为json"""
102 | return json.dumps(dic, ensure_ascii=False)
103 |
--------------------------------------------------------------------------------
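For orientation, a minimal, hypothetical usage sketch of the ResultClient defined above; the gRPC address and argument values are illustrative assumptions, not taken from the repository:

# Hypothetical example only: the address and values below are assumptions.
from rpc.client.result import ResultClient

client = ResultClient(rpc_server='engine:50051')  # engine-side gRPC endpoint (assumed)

# Mark a subtask as finished on the engine side.
client.set_subtask_status(subtask_id=1, status=True, finished_at='2021-01-01 00:00:00')

# Note: save_base_result() additionally expects 'response_headers', 'redirect_chain' and
# 'content' among its keyword arguments, and upload_screenshot()/upload_har_file() expect
# the file to exist under /usr/src/app/screenshots/ and /usr/src/app/hars/ respectively.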
/services/spider/rpc/pb/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/sources.list:
--------------------------------------------------------------------------------
1 | deb http://mirrors.aliyun.com/debian stretch main contrib non-free
2 | deb-src http://mirrors.aliyun.com/debian stretch main contrib non-free
3 | deb http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
4 | deb-src http://mirrors.aliyun.com/debian stretch-updates main contrib non-free
5 | deb http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
6 | deb-src http://mirrors.aliyun.com/debian-security stretch/updates main contrib non-free
--------------------------------------------------------------------------------
/services/spider/webs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | from flask import Flask
6 |
7 | from webs.api.utils.requests import before_request_middleware, \
8 | after_request_middleware, teardown_appcontext_middleware
9 | from webs.api.utils.responses import JSONResponse, app_error_handler
10 | from webs.api.utils.routers import register_routes as init_routes
11 | from webs.api.utils.settings import init_db
12 |
13 |
14 | def create_app():
15 | # instantiate the app
16 | app = Flask(__name__)
17 |
18 | # set config
19 | app_settings = os.getenv('APP_SETTINGS')
20 | app.config.from_object(app_settings)
21 |
22 | # register all blueprints
23 | init_routes(app=app)
24 |
25 | # register custom response class
26 | app.response_class = JSONResponse
27 |
28 | # register custom error handler
29 | app_error_handler(app=app)
30 |
31 | # register before request middleware
32 | before_request_middleware(app=app)
33 |
34 | # register after request middleware
35 | after_request_middleware(app=app)
36 |
37 | # register after app context teardown middleware
38 | teardown_appcontext_middleware(app=app)
39 |
40 | # set up extensions
41 | app_db = init_db(app=app)
42 |
43 | # shell context for flask cli
44 | @app.shell_context_processor
45 | def ctx():
46 | return {'app': app, 'db': app_db}
47 |
48 | return app
49 |
--------------------------------------------------------------------------------
/services/spider/webs/api/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/bizs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/bizs/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models.db_proxy import crawl_task_model_proxy
4 |
5 |
6 | class CrawlTaskBiz(object):
7 |
8 | def __init__(self):
9 | pass
10 |
11 | def create_crawl_task(self, subtask_id, url_nested_list, options={}):
12 | """
13 | 调度爬虫
14 | :param subtask_id:
15 | :param url_nested_list:
16 | :param options:
17 | :return:
18 | """
19 |
20 | # Create the CrawlTask record
21 | crawl_task_obj = crawl_task_model_proxy.create(
22 | subtask_id=subtask_id, url_nested_list=url_nested_list,
23 | process_state='readying', options=options)
24 |
25 | # Crawl asynchronously via Celery
26 | from worker import celery_app
27 | celery_app.send_task(
28 | name='fetch_tasks', queue='priority_fetch', priority=options.get('priority', 3),
29 | kwargs={'crawl_task_id': crawl_task_obj.id})
30 |
--------------------------------------------------------------------------------
/services/spider/webs/api/exceptions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/exceptions/customs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from werkzeug.exceptions import BadRequest, \
5 | NotFound, Unauthorized, Forbidden, InternalServerError, Conflict
6 |
7 |
8 | class CustomException(Exception):
9 | """Custom JSON based exception."""
10 |
11 | status_code = BadRequest.code
12 | message = ""
13 |
14 | def __init__(self, message=None, status_code=None):
15 | """
16 | :param status_code: response status_code
17 | :param message: exception message
18 | """
19 |
20 | Exception.__init__(self)
21 |
22 | if message is not None:
23 | self.message = message
24 | if status_code is not None:
25 | self.status_code = status_code
26 |
27 | def to_dict(self):
28 | return {
29 | "status": False,
30 | "error": {
31 | "message": self.message,
32 | "type": str(self.__class__.__name__)
33 | }
34 | }
35 |
36 |
37 | class InvalidContentType(CustomException):
38 | """
39 | Raised when an invalid Content-Type is provided.
40 | """
41 |
42 | status_code = BadRequest.code
43 |
44 |
45 | class UnauthorizedAPIRequest(CustomException):
46 | """
47 | Raise if the user is not authorized. Also used if you want to use HTTP
48 | basic auth.
49 | """
50 |
51 | status_code = Unauthorized.code
52 |
53 |
54 | class InvalidPermissions(CustomException):
55 | """
56 | Raise if the user doesn't have the permission for the requested resource
57 | but was authenticated.
58 | """
59 |
60 | status_code = Forbidden.code
61 |
62 |
63 | class InvalidAPIRequest(CustomException):
64 | """
65 | Raised when an invalid request has been made.
66 | (e.g. a nonexistent URL was accessed, or the schema validation did
67 | not pass)
68 | """
69 |
70 | status_code = BadRequest.code
71 |
72 |
73 | class ServerError(CustomException):
74 | """
75 | Generic internal error.
76 | Inherit this error for all subsequent
77 | errors that are related to the server.
78 | """
79 |
80 | status_code = InternalServerError.code
81 |
82 |
83 | class DatabaseError(CustomException):
84 | """
85 | Generic database interaction error.
86 | Inherit this error for all subsequent
87 | errors that are related to database.
88 | """
89 |
90 | status_code = InternalServerError.code
91 |
92 |
93 | class RecordNotFound(DatabaseError):
94 | """
95 | Raised when the record was not found in the database.
96 | """
97 |
98 | status_code = NotFound.code
99 |
100 |
101 | class RecordAlreadyExists(DatabaseError):
102 | """
103 | Raised in the case of violation of a unique constraint.
104 | """
105 |
106 | status_code = Conflict.code
107 |
108 |
109 | class PublishError(CustomException):
110 | """
111 | Raised in the case of a publish error.
112 | """
113 |
114 | status_code = InternalServerError.code
115 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .base_model import db, redis_store
4 | from .crawl_task import CrawlTask
5 | from .result import Result
6 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/base_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 | from flask_redis import FlaskRedis
5 |
6 | db = SQLAlchemy()
7 | redis_store = FlaskRedis()
8 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | 底层爬虫子任务与Url映射关系
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, ARRAY
9 | from sqlalchemy.dialects.postgresql import JSONB
10 |
11 | from webs.api.models import db
12 |
13 |
14 | class CrawlTask(db.Model):
15 | __tablename__ = 'crawl_tasks'
16 |
17 | id = Column(BigInteger, primary_key=True, autoincrement=True)
18 | subtask_id = Column(Integer, nullable=False, index=True) # ID of the owning subtask
19 | url_nested_list = Column(JSONB) # [{"url_id": xxx, "url_address": xxx, 'url_options': {}}]
20 | process_state = Column(String(30), server_default='readying') # readying / running / finished
21 | failure_url_ids = Column(ARRAY(Integer), server_default='{}') # URLs that failed to crawl
22 | finished_at = Column(TIMESTAMP) # finish time
23 | options = Column(JSONB) # crawl options
24 |
25 | # success_count = Column(Integer) # number of successfully crawled URLs
26 | # failure_count = Column(Integer) # number of failed URLs
27 |
28 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
29 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
30 |
31 | def __repr__(self):
32 | return f'<CrawlTask {self.id}>'  # assumed repr format; the original f-string body was lost
33 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .crawl_task import CrawlTaskModelProxy
4 | from .result import ResultModelProxy
5 |
6 | crawl_task_model_proxy = CrawlTaskModelProxy()
7 | result_model_proxy = ResultModelProxy()
8 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/crawl_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import CrawlTask
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class CrawlTaskModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = CrawlTask
11 |
12 | def create(self, **kwargs):
13 | """
14 | Create a CrawlTask object
15 | """
16 |
17 | crawl_task_obj = CrawlTask(
18 | subtask_id=kwargs['subtask_id'], url_nested_list=kwargs['url_nested_list'],
19 | process_state=kwargs['process_state'], options=kwargs['options'])
20 | self.db_session.add(crawl_task_obj)
21 | self.safe_commit()
22 | return crawl_task_obj
23 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/db_proxy/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webs.api.models import Result
4 | from webs.api.models.db_proxy.base import BaseModelProxy
5 |
6 |
7 | class ResultModelProxy(BaseModelProxy):
8 | def __init__(self):
9 | super().__init__()
10 | self.model = Result
11 |
12 | def create(self, subtask_id, url_id, url_address, **kwargs):
13 | """
14 | Save a crawl result.
15 | :param subtask_id:
16 | :param url_id:
17 | :param url_address:
18 | :param kwargs:
19 | :return:
20 | """
21 |
22 | result_obj = Result(
23 | subtask_id=subtask_id, url_id=url_id, url_address=url_address,
24 | http_code=kwargs.get('http_code'), title=kwargs.get('title'),
25 | content=kwargs.get('content'), current_url=kwargs.get('current_url'),
26 | har_uuid=kwargs.get('har_uuid'), screenshot_id=kwargs.get('screenshot_id'),
27 | response_headers=kwargs.get('response_headers', {}), redirect_chain=kwargs.get('redirect_chain', []),
28 | cookies=kwargs.get('cookies', [])
29 | )
30 | self.db_session.add(result_obj)
31 | self.safe_commit()
32 | return result_obj
33 |
34 | def query_already_crawl_url_ids(self, subtask_id):
35 | """
36 | Query the URL IDs that have already been crawled.
37 | :param subtask_id:
38 | :return:
39 | """
40 |
41 | query = self.db_session.query(self.model.url_id).filter(self.model.subtask_id == subtask_id).all()
42 | return [each[0] for each in query]
43 |
--------------------------------------------------------------------------------
/services/spider/webs/api/models/result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | Low-level result storage, kept as a backup
6 | """
7 |
8 | from sqlalchemy import Column, BigInteger, String, TIMESTAMP, func, Integer, Text
9 | from sqlalchemy.dialects.postgresql import JSONB
10 |
11 | from webs.api.models import db
12 |
13 |
14 | class Result(db.Model):
15 | __tablename__ = 'results'
16 |
17 | id = Column(BigInteger, primary_key=True, autoincrement=True)
18 | subtask_id = Column(Integer, nullable=False, index=True) # ID of the owning subtask
19 | url_id = Column(Integer, nullable=False, index=True) # url id
20 | url_address = Column(String(1024), nullable=False) # URL address
21 | http_code = Column(Integer) # site HTTP status code
22 | title = Column(Text) # site title
23 | content = Column(Text) # site content
24 | current_url = Column(String(1024)) # last URL the site responded with
25 | redirect_chain = Column(JSONB) # redirect chain
26 | response_headers = Column(JSONB) # response headers
27 | har_uuid = Column(String(128)) # file storing the site's HTTP interaction (HAR)
28 | screenshot_id = Column(String(128)) # screenshot ID
29 | cookies = Column(JSONB) # cookies
30 |
31 | create_time = Column(TIMESTAMP, server_default=func.now(), index=True)
32 | update_time = Column(TIMESTAMP, server_default=func.now(), onupdate=func.now(), index=True)
33 |
34 | def __repr__(self):
35 | return f'<Result {self.id}>'  # assumed repr format; the original f-string body was lost
36 |
--------------------------------------------------------------------------------
/services/spider/webs/api/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from webs.api.exceptions.customs import InvalidAPIRequest
5 |
6 |
7 | class LengthChecker(object):
8 | """字段长度校验"""
9 |
10 | def __init__(self, sign, length):
11 | self.sign = sign
12 | self.length = length
13 |
14 | def __call__(self, verified):
15 | if verified is not None and len(verified) > self.length:
16 | raise InvalidAPIRequest(f'{self.sign} is too long!')
17 |
18 |
19 | class OneOf(object):
20 | """Validator which succeeds if ``value`` is a member of ``choices``"""
21 |
22 | def __init__(self, choices):
23 | self.choices = choices
24 |
25 | def __call__(self, verified):
26 | if verified not in self.choices:
27 | raise InvalidAPIRequest(f'Please choose one of {self.choices}!')
28 |
--------------------------------------------------------------------------------
/services/spider/webs/api/schemas/crawl_tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from webargs import fields
4 |
5 | from webs.api.schemas import OneOf
6 |
7 | create_crawl_task_schema = {
8 | 'subtask_id': fields.Int(required=True),
9 | 'url_nested_list': fields.DelimitedList(fields.Nested({
10 | 'url_id': fields.Int(required=True),
11 | 'url_address': fields.Str(required=True),
12 | 'url_options': fields.Dict(missing={})
13 | }), required=True),
14 | 'options': fields.Nested({
15 | 'browser_type': fields.Str(missing='firefox', validate=OneOf(['chromium', 'firefox'])),
16 | 'priority': fields.Int(missing=3, validate=OneOf(choices=[1, 2, 3, 4, 5])), # task priority
17 | 'headless': fields.Bool(missing=False), # headed/headless mode; headed by default
18 | 'debug': fields.Bool(missing=False), # whether to enable debug mode
19 | 'referer': fields.Str(), # referer for the target site
20 | 'concurrency': fields.Int(missing=5, validate=OneOf(choices=[5, 10, 15, 20, 25, 30])), # concurrency
21 | 'url_timeout': fields.Int(missing=30), # timeout for a single URL
22 | 'enabled_render_js': fields.Bool(missing=True),
23 | 'page_wait_time': fields.Int(missing=3), # time to wait for page JS rendering
24 | 'ignore_ssl': fields.Bool(missing=True), # whether to ignore certificate errors
25 | 'screenshot': fields.Bool(missing=False), # whether to take a screenshot
26 | 'proxy_url': fields.Str(), # proxy
27 | 'user_agent': fields.Str(), # User-Agent
28 | 'record_har': fields.Bool(missing=False), # whether to record request networks (HAR)
29 | 'record_redirect': fields.Bool(missing=False), # whether to record the redirect chain
30 | 'use_browser_cache': fields.Bool(missing=True), # whether to use the browser cache
31 | 'use_result_cache': fields.Bool(missing=True), # whether to use the result cache
32 | 'wappalyzer': fields.Bool(missing=False), # whether to run fingerprint identification
33 | 'wait_until': fields.Str(
34 | missing='load', validate=OneOf(choices=['domcontentloaded', 'load', 'networkidle'])), # controls when the page counts as loaded
35 | 'rpc_server': fields.Str(required=True)
36 | }, missing={})
37 | }
38 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/loggers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import socket
6 | import sys
7 | import traceback
8 | from datetime import datetime
9 |
10 | try:
11 | import simplejson as json
12 | except ImportError:
13 | import json
14 |
15 |
16 | class JSONFormatter(logging.Formatter):
17 | """
18 | JSON formatter for python logging
19 |
20 | You can pass additional tags on a per message basis using the
21 | key "tags" in the extra parameter.
22 | eg: logger.error('hello world!', extra={"tags": ["hello=world"]})
23 | """
24 |
25 | def __init__(self, tags=None, hostname=None, fqdn=False, message_type='JSON',
26 | indent=None):
27 | """
28 | :param tags: a list of tags to add to every message
29 | :param hostname: force a specific hostname
30 | :param fqdn: a boolean to use the FQDN instead of the machine's hostname
31 | :param message_type: the message type for Logstash formatters
32 | :param indent: indent level of the JSON output
33 | """
34 | self.message_type = message_type
35 | self.tags = tags if tags is not None else []
36 | self.extra_tags = []
37 | self.indent = indent
38 |
39 | if hostname:
40 | self.host = hostname
41 | elif fqdn:
42 | self.host = socket.getfqdn()
43 | else:
44 | self.host = socket.gethostname()
45 |
46 | def get_extra_fields(self, record):
47 | # The list contains all the attributes listed in
48 | # http://docs.python.org/library/logging.html#logrecord-attributes
49 | skip_list = [
50 | 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args',
51 | 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg',
52 | 'msecs', 'message', 'name', 'pathname', 'process',
53 | 'processName', 'relativeCreated', 'thread', 'threadName', 'extra']
54 |
55 | if sys.version_info < (3, 0):
56 | easy_types = (str, bool, dict, float, int, list, type(None))
57 | else:
58 | easy_types = (str, bool, dict, float, int, list, type(None))
59 |
60 | fields = {}
61 |
62 | self.extra_tags = []
63 | for key, value in record.__dict__.items():
64 | if key not in skip_list:
65 | if key == 'tags' and isinstance(value, list):
66 | self.extra_tags = value
67 | elif isinstance(value, easy_types):
68 | fields[key] = value if value else "null"
69 | else:
70 | fields[key] = repr(value)
71 |
72 | return fields
73 |
74 | def get_debug_fields(self, record):
75 | if record.exc_info:
76 | exc_info = self.format_exception(record.exc_info)
77 | else:
78 | exc_info = record.exc_text
79 | return {
80 | 'exc_info': exc_info,
81 | 'filename': record.filename,
82 | 'lineno': record.lineno,
83 | }
84 |
85 | @classmethod
86 | def format_source(cls, message_type, host, path):
87 | return "%s://%s/%s" % (message_type, host, path)
88 |
89 | @classmethod
90 | def format_timestamp(cls, time):
91 | return str(datetime.fromtimestamp(time).strftime("%Y-%m-%d %X"))
92 |
93 | @classmethod
94 | def format_exception(cls, exc_info):
95 | return ''.join(traceback.format_exception(*exc_info)) if exc_info else ''
96 |
97 | @classmethod
98 | def serialize(cls, message, indent=None):
99 | return json.dumps(message, ensure_ascii=False, indent=indent)
100 |
101 | def format(self, record, serialize=True):
102 | old_message = record.getMessage()
103 | try:
104 | new_message = json.loads(old_message)
105 | except json.decoder.JSONDecodeError:
106 | try: # single-quoted dict-like messages: retry after swapping quotes
107 | new_message = json.loads(old_message.replace("'", '"'))
108 | except Exception: # still not valid JSON: fall back to the raw message text
109 | new_message = old_message
110 | # Create message dict
111 | message = {
112 | 'timestamp': self.format_timestamp(record.created),
113 | 'app': os.environ.get('APP_NAME'),
114 | 'host': self.host,
115 | 'environment': os.environ.get('FLASK_ENV'),
116 | 'logger': record.name,
117 | 'level': record.levelname,
118 | 'messages': new_message,
119 | 'path': record.pathname,
120 | 'tags': self.tags[:]
121 | }
122 |
123 | # Add extra fields
124 | message.update(self.get_extra_fields(record))
125 |
126 | # Add extra tags
127 | if self.extra_tags:
128 | message['tags'].extend(self.extra_tags)
129 |
130 | # If exception, add debug info
131 | if record.exc_info or record.exc_text:
132 | message.update(self.get_debug_fields(record))
133 |
134 | if serialize:
135 | return self.serialize(message, indent=self.indent)
136 | return message
137 |
--------------------------------------------------------------------------------
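A short sketch of how the JSONFormatter above can be attached to a standard-library logger; the logger name, tag and level are arbitrary examples:

import logging

from webs.api.utils.loggers import JSONFormatter

handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter(tags=['spider']))  # tag list is illustrative

logger = logging.getLogger('spider')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Extra tags can be attached per message, as the class docstring notes.
logger.error('hello world!', extra={"tags": ["hello=world"]})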
/services/spider/webs/api/utils/requests.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 | from flask import current_app, request
5 | from sqlalchemy.exc import DatabaseError
6 | from webs.api.exceptions.customs import InvalidContentType
7 | from webs.api.models import db
8 |
9 | ACL_ORIGIN = 'Access-Control-Allow-Origin'
10 | ACL_METHODS = 'Access-Control-Allow-Methods'
11 | ACL_ALLOWED_HEADERS = 'Access-Control-Allow-Headers'
12 | ACL_CREDENTIALS = 'Access-Control-Allow-Credentials'
13 | ACL_CACHE_CONTROL = 'Cache-Control'
14 |
15 | GET_METHOD = 'GET'
16 | OPTIONS_METHOD = 'OPTIONS'
17 | ALLOWED_ORIGINS = '*'
18 | ALLOWED_METHODS = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
19 | ALLOWED_HEADERS = 'Authorization, DNT, X-CustomHeader, Keep-Alive, User-Agent, ' \
20 | 'X-Requested-With, If-Modified-Since, Cache-Control, Content-Type'
21 | ALLOWED_CREDENTIALS = 'true' # Allow send cookie
22 | ALLOWED_CACHE_CONTROL = 'no-cache, no-store, must-revalidate'
23 |
24 |
25 | def before_request_middleware(app):
26 | app.before_request_funcs.setdefault(None, [
27 | ensure_request_log,
28 | ensure_content_type,
29 | ])
30 |
31 |
32 | def after_request_middleware(app):
33 | app.after_request_funcs.setdefault(None, [
34 | enable_cors,
35 | commit_session,
36 | ])
37 |
38 |
39 | def teardown_appcontext_middleware(app):
40 | app.teardown_appcontext_funcs = [
41 | shutdown_session,
42 | ]
43 |
44 |
45 | def ensure_request_log():
46 | """当为生产环境时,屏蔽中间件日志记录器"""
47 | if current_app.debug:
48 | current_app.logger.info(
49 | "Request Time: {time} || Request Client IP: {client} || Full Path: {path} || "
50 | "Parameters: {param}".format(
51 | time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
52 | client=request.environ.get('HTTP_X_REAL_IP', request.remote_addr),
53 | path=request.full_path,
54 | param=request.data.decode('utf-8')))
55 |
56 |
57 | def ensure_content_type():
58 | """
59 | Ensures that the Content-Type for all requests
60 | is `application/json` or `multipart/form-data`, otherwise an appropriate error
61 | is raised.
62 | :raises: InvalidContentType if Content-Type is neither `application/json`
63 | nor `multipart/form-data`
64 | """
65 |
66 | content_type = request.headers.get('Content-Type')
67 | if request.method != GET_METHOD and request.method != OPTIONS_METHOD and \
68 | (not content_type or not ('application/json' in content_type or
69 | 'multipart/form-data' in content_type)):
70 | raise InvalidContentType(
71 | message='Invalid Content-Type. '
72 | 'Only `application/json` or `multipart/form-data` is allowed')
73 |
74 |
75 | def enable_cors(response):
76 | """
77 | Enable Cross-origin resource sharing.
78 | These headers are needed for the clients that
79 | will consume the API via AJAX requests.
80 | """
81 | if request.method == OPTIONS_METHOD:
82 | response = current_app.make_default_options_response()
83 | response.headers[ACL_ORIGIN] = ALLOWED_ORIGINS
84 | response.headers[ACL_METHODS] = ALLOWED_METHODS
85 | response.headers[ACL_ALLOWED_HEADERS] = ALLOWED_HEADERS
86 | response.headers[ACL_CACHE_CONTROL] = ALLOWED_CACHE_CONTROL
87 |
88 | return response
89 |
90 |
91 | def commit_session(response):
92 | """
93 | Try to commit the db session in the case
94 | of a successful request with status_code
95 | under 400.
96 | """
97 | if response.status_code >= 400:
98 | return response
99 | try:
100 | db.session.commit()
101 | except DatabaseError:
102 | db.session.rollback()
103 | return response
104 |
105 |
106 | def shutdown_session(exception=None):
107 | """
108 | Remove the db session and detach from the
109 | database driver after application shutdown.
110 | """
111 | db.session.remove()
112 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/routers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pkgutil
4 |
5 |
6 | def register_routes(app):
7 | """Register routes."""
8 | from .. import views
9 | from flask.blueprints import Blueprint
10 |
11 | for _, name, _ in pkgutil.iter_modules(views.__path__, prefix=views.__name__ + "."):
12 | blueprint_name = name.split('.')[-1]
13 | modules = __import__(name, fromlist="dummy")
14 | blueprint = getattr(modules, blueprint_name)
15 | if isinstance(blueprint, Blueprint):
16 | app.register_blueprint(blueprint)
17 |
--------------------------------------------------------------------------------
/services/spider/webs/api/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask_migrate import Migrate
4 |
5 | from webs.api.models import db, redis_store
6 |
7 |
8 | def init_db(app):
9 | """
10 | Create the database if it doesn't exist and
11 | create all tables.
12 | """
13 |
14 | # Initialize PostgreSQL
15 | db.init_app(app)
16 | migrate = Migrate(compare_type=True, compare_server_default=True)
17 | migrate.init_app(app, db)
18 |
19 | # Initialize Redis
20 | redis_store.init_app(app)
21 |
22 | return db
23 |
--------------------------------------------------------------------------------
/services/spider/webs/api/views/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/webs/api/views/crawl_tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 | from webargs.flaskparser import use_args
5 |
6 | from webs.api.bizs.crawl_task import CrawlTaskBiz
7 | from webs.api.schemas.crawl_tasks import create_crawl_task_schema
8 |
9 | crawl_tasks = Blueprint('crawl_tasks', __name__, url_prefix='/crawl_tasks')
10 |
11 |
12 | @crawl_tasks.route('', methods=['POST'])
13 | @use_args(create_crawl_task_schema, locations=('json',))
14 | def create_crawl_task(args):
15 | """
16 | 创建爬虫任务
17 | :param args:
18 | :return:
19 | """
20 | crawl_task_biz = CrawlTaskBiz()
21 | data = crawl_task_biz.create_crawl_task(**args)
22 |
23 | return jsonify({
24 | 'status': True,
25 | 'data': data
26 | }), 201
27 |
--------------------------------------------------------------------------------
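For reference, a hypothetical call to the endpoint above; the host, port and payload values are assumptions (the port follows the mapping in spider.docker-conpose.yml, and the payload shape follows create_crawl_task_schema):

import requests

payload = {
    'subtask_id': 1,
    'url_nested_list': [
        {'url_id': 10, 'url_address': 'http://example.com', 'url_options': {}},
    ],
    'options': {
        'browser_type': 'firefox',
        'screenshot': True,
        'rpc_server': 'engine:50051',  # engine-side gRPC endpoint (assumed)
    },
}

# The request middleware requires an application/json (or multipart/form-data) Content-Type.
resp = requests.post('http://localhost:15001/crawl_tasks', json=payload)
print(resp.status_code, resp.json())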
/services/spider/webs/api/views/ping.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from flask import Blueprint, jsonify
4 |
5 | ping = Blueprint('ping', __name__)
6 |
7 |
8 | @ping.route('/ping', methods=['GET'])
9 | def ping_pong():
10 | """
11 | Check whether the service is available.
12 | """
13 | return jsonify({
14 | "data": "pong",
15 | "status": True
16 | })
17 |
--------------------------------------------------------------------------------
/services/spider/webs/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 |
6 | class BaseConfig:
7 | """Base configuration"""
8 |
9 | # Root path of project
10 | PROJECT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
11 |
12 | DEBUG = True
13 | TESTING = False
14 | SQLALCHEMY_TRACK_MODIFICATIONS = False
15 | SQLALCHEMY_ENGINE_OPTIONS = {'pool_pre_ping': True}
16 | SECRET_KEY = os.environ.get('SECRET_KEY')
17 |
18 | # Redis configuration
19 | REDIS_URL = os.environ.get('REDIS_URL')
20 |
21 |
22 | class DevelopmentConfig(BaseConfig):
23 | """Development configuration"""
24 |
25 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
26 |
27 |
28 | class ProductionConfig(BaseConfig):
29 | """Production configuration"""
30 |
31 | DEBUG = False
32 | SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL')
33 |
--------------------------------------------------------------------------------
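The configuration above is driven entirely by environment variables; a hypothetical spider.env (referenced by the compose file later in this listing) might look like the following, with placeholder hostnames and credentials:

APP_SETTINGS=webs.config.DevelopmentConfig
FLASK_ENV=development
APP_NAME=spider
SECRET_KEY=change_me
DATABASE_URL=postgresql://user:password@db:5432/spider
REDIS_URL=redis://redis:6379/0
CRAWL_CELERY_BROKER_URL=amqp://guest:guest@rabbitmq:5672//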
/services/spider/worker/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from celery import Celery
5 |
6 | ##################
7 | # Celery configuration
8 | from kombu import Queue
9 |
10 | from webs import create_app
11 |
12 |
13 | class CeleryConfig(object):
14 | # Serialize tasks as JSON; from Celery 4.0 the default serializer is JSON
15 | task_serializer = 'json'
16 |
17 | # Serialize results as JSON
18 | result_serializer = 'json'
19 |
20 | # Expiry time for task results
21 | result_expires = 60 * 60 * 24
22 |
23 | # Disable worker task events to keep the event queue from overflowing
24 | worker_send_task_events = False
25 |
26 | # Accepted task content types
27 | accept_content = ["json"]
28 |
29 | # Tasks prefetched per process; overridden by the launch arguments, kept here only as a marker
30 | worker_prefetch_multiplier = 4
31 |
32 | # Each worker is recycled after one task; overridden by the launch arguments, kept here only as a marker
33 | worker_max_tasks_per_child = 1
34 |
35 | # Timezone settings
36 | timezone = 'Asia/Shanghai'
37 | enable_utc = True
38 |
39 |
40 | ##################
41 | # Initialize the Celery worker
42 | def init_celery(app=None, celery_type='usual'):
43 | app = app or create_app()
44 | celery_app = Celery(__name__, broker=os.environ.get('CRAWL_CELERY_BROKER_URL'))
45 | celery_app.config_from_object(CeleryConfig)
46 |
47 | # Import the relevant task modules
48 | if celery_type == 'usual':
49 | celery_app.conf.update(imports=['worker.fetch', 'worker.results'])
50 | celery_app.conf.task_queues = (
51 | Queue("priority_fetch", queue_arguments={'x-max-priority': 5}),
52 | Queue("results"),
53 | )
54 | elif celery_type == 'beat':
55 | pass
56 | # celery_app.conf.update(
57 | # imports=['project.api.tasks.cron', 'project.api.tasks.event_cron', 'project.api.tasks.visual_cron'])
58 | # celery_app.conf.update(
59 | # CELERYBEAT_SCHEDULE={
60 | # }
61 | # )
62 |
63 | # Execute tasks inside the Flask app context
64 | class ContextTask(celery_app.Task):
65 | """Make celery tasks work with Flask app context"""
66 |
67 | def __call__(self, *args, **kwargs):
68 | with app.app_context():
69 | return self.run(*args, **kwargs)
70 |
71 | celery_app.Task = ContextTask
72 | return celery_app
73 |
74 |
75 | celery_app = init_celery()
76 | # beat_app = init_celery(celery_type='beat')
77 |
--------------------------------------------------------------------------------
/services/spider/worker/fetch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from datetime import datetime
4 |
5 | from billiard.exceptions import SoftTimeLimitExceeded
6 |
7 | from rpc.client.result import ResultClient
8 | from webs.api.models.db_proxy import crawl_task_model_proxy, result_model_proxy
9 | from worker import celery_app
10 | from worker.library.playwright import PlayWrightHandler
11 |
12 |
13 | @celery_app.task(
14 | name='fetch_tasks', queue='priority_fetch', acks_late=True, soft_time_limit=1000, max_retries=1,
15 | default_retry_delay=30, autoretry_for=(Exception,))
16 | def fetch_tasks(crawl_task_id):
17 | """
18 | Pull a task from the priority queue and crawl it.
19 | """
20 |
21 | crawl_task_obj = crawl_task_model_proxy.find_one_with_condition(
22 | crawl_task_model_proxy.model.id == crawl_task_id,
23 | crawl_task_model_proxy.model.process_state != 'finished'
24 | )
25 | if not crawl_task_obj:
26 | return
27 |
28 | # Mark the crawl task as started
29 | if crawl_task_obj.process_state == 'readying':
30 | crawl_task_model_proxy.set_attr(crawl_task_obj, 'process_state', 'running')
31 | url_nested_list = crawl_task_obj.url_nested_list
32 |
33 | # This branch is hit when a worker process exited abnormally: RabbitMQ never acked the message, so the task is redelivered after the worker restarts
34 | else: # crawl_task_obj.process_state == 'running'
35 | already_url_ids = result_model_proxy.query_already_crawl_url_ids(subtask_id=crawl_task_obj.subtask_id)
36 | url_nested_list = [
37 | url_info for url_info in crawl_task_obj.url_nested_list
38 | if url_info['url_id'] not in already_url_ids
39 | ]
40 | undone_url_ids = []
41 | if url_nested_list:
42 | # Run the crawl
43 | playwright_handler = PlayWrightHandler(
44 | subtask_id=crawl_task_obj.subtask_id,
45 | url_nested_list=url_nested_list,
46 | options=crawl_task_obj.options)
47 | undone_url_ids = playwright_handler.run()
48 |
49 | # Record the crawl state, finish time and the URLs that failed
50 | crawl_task_model_proxy.set_many_attr(
51 | obj=crawl_task_obj,
52 | fields_v={
53 | 'process_state': 'finished',
54 | 'finished_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
55 | 'failure_url_ids': undone_url_ids
56 | }
57 | )
58 |
59 | ####### Call the engine-side RPC service to set the subtask crawl status
60 | # Connect to the gRPC service
61 | grpc_result_client = ResultClient(crawl_task_obj.options.get('rpc_server'))
62 |
63 | # Set the subtask crawl status
64 | grpc_result_client.set_subtask_status(
65 | subtask_id=crawl_task_obj.subtask_id, status=True, finished_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
66 |
--------------------------------------------------------------------------------
/services/spider/worker/library/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/services/spider/worker/library/helper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | from typing import Optional
5 |
6 | if sys.version_info >= (3, 8):
7 | from typing import TypedDict # pylint: disable=no-name-in-module
8 | else:
9 | from typing_extensions import TypedDict
10 |
11 |
12 | class ProxyServer(TypedDict):
13 | server: str
14 | bypass: Optional[str]
15 | username: Optional[str]
16 | password: Optional[str]
17 |
18 |
19 | class RecordHarOptions(TypedDict):
20 | omitContent: Optional[bool]
21 | path: str
22 |
--------------------------------------------------------------------------------
/services/spider/worker/results.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import hashlib
3 | import json
4 | import os
5 | import time
6 |
7 | from rpc.client.result import ResultClient
8 | from webs.api.models.db_proxy import result_model_proxy
9 | from worker import celery_app
10 |
11 |
12 | @celery_app.task(name='save_results', queue='results')
13 | def save_results(subtask_id, url_id, url_address, rpc_server, **kwargs):
14 | """
15 | Save crawl results.
16 | :param subtask_id:
17 | :param url_id:
18 | :param url_address:
19 | :param rpc_server:
20 | :param kwargs:
21 | :return:
22 | """
23 |
24 | http_archive_dict = kwargs.pop('http_archive_dict')
25 |
26 | # Persist the crawl result locally, used only as a disaster-recovery backup
27 | result_model_proxy.create(subtask_id, url_id, url_address)
28 |
29 | # Connect to the gRPC service
30 | grpc_result_client = ResultClient(rpc_server)
31 |
32 | # Report the screenshot back to the engine
33 | if kwargs.get('screenshot_id') \
34 | and os.path.exists('/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id'])):
35 | img_path = '/usr/src/app/screenshots/{}.png'.format(kwargs['screenshot_id'])
36 | try:
37 | with open(img_path, 'rb') as f:
38 | md5 = hashlib.md5()
39 | while True:
40 | fb = f.read(8096)
41 | if not fb:
42 | break
43 | md5.update(fb)
44 | screenshot_md5 = md5.hexdigest()
45 | os.rename(img_path, f'/usr/src/app/screenshots/{screenshot_md5}.png')
46 | kwargs['screenshot_id'] = screenshot_md5
47 | grpc_result_client.upload_screenshot(screenshot_name=f'{screenshot_md5}.png')
48 | except Exception as e:
49 | pass
50 |
51 | # Report the basic crawl data back to the engine
52 | grpc_result_client.save_base_result(subtask_id, url_id, url_address, **kwargs)
53 |
54 | # Report the HAR file back to the engine
55 | if kwargs.get('har_uuid') and http_archive_dict.get('hars'):
56 | with open('/usr/src/app/hars/{}.json'.format(kwargs['har_uuid']), 'w+', encoding='utf-8') as f:
57 | f.write(json.dumps(http_archive_dict, ensure_ascii=False, indent=2))
58 | grpc_result_client.upload_har_file(har_file_name='{}.json'.format(kwargs['har_uuid']))
59 |
--------------------------------------------------------------------------------
/spider.docker-conpose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | spider-client:
5 | container_name: spider-client
6 | build:
7 | context: ./services/spider
8 | dockerfile: Dockerfile-dev
9 | volumes:
10 | - './services/spider:/usr/src/app'
11 | ports:
12 | - 15001:5000
13 | env_file:
14 | - spider.env
15 | environment:
16 | - FLASK_APP=webs:create_app
17 | - ENDPOINT=web
18 | restart: always
19 |
20 | spider-fetch:
21 | container_name: spider-fetch
22 | build:
23 | context: ./services/spider
24 | dockerfile: Dockerfile-dev
25 | volumes:
26 | - './services/spider:/usr/src/app'
27 | env_file:
28 | - spider.env
29 | environment:
30 | - ENDPOINT=fetch
31 | restart: always
32 |
33 |
34 | save-results:
35 | container_name: save-results
36 | build:
37 | context: ./services/spider
38 | dockerfile: Dockerfile-dev
39 | volumes:
40 | - './services/spider:/usr/src/app'
41 | env_file:
42 | - spider.env
43 | environment:
44 | - ENDPOINT=results
45 | restart: always
--------------------------------------------------------------------------------
/架构图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/who0sy/crawloop/b9fcc21f7ec712a74cb5952686c1f4cce896207e/架构图.png
--------------------------------------------------------------------------------