├── images
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── d1.png
│   ├── d2.png
│   ├── d3.png
│   ├── d4.png
│   └── d5.png
├── watchdogs
│   ├── __main__.py
│   ├── static
│   │   ├── css
│   │   │   ├── fonts
│   │   │   │   └── element-icons.woff
│   │   │   ├── watchdogs.min.css
│   │   │   └── watchdogs.css
│   │   ├── img
│   │   │   └── favicon.svg
│   │   └── js
│   │       ├── clipboard.min.js
│   │       ├── watchdogs.min.js
│   │       └── vue-resource.min.js
│   ├── __init__.py
│   ├── background.py
│   ├── templates
│   │   ├── auth.html
│   │   ├── logs.html
│   │   ├── groups.html
│   │   ├── feeds.html
│   │   ├── lite.html
│   │   └── index.html
│   ├── main.py
│   ├── callbacks.py
│   ├── config.py
│   ├── utils.py
│   ├── settings.py
│   ├── crawler.py
│   ├── models.py
│   └── app.py
├── run_server.py
├── requirements.txt
├── LICENSE
├── .gitignore
├── setup.py
├── quick_start.md
└── README.md
/images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/1.png
--------------------------------------------------------------------------------
/images/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/2.png
--------------------------------------------------------------------------------
/images/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/3.png
--------------------------------------------------------------------------------
/images/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/4.png
--------------------------------------------------------------------------------
/images/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/5.png
--------------------------------------------------------------------------------
/images/d1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d1.png
--------------------------------------------------------------------------------
/images/d2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d2.png
--------------------------------------------------------------------------------
/images/d3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d3.png
--------------------------------------------------------------------------------
/images/d4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d4.png
--------------------------------------------------------------------------------
/images/d5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d5.png
--------------------------------------------------------------------------------
/watchdogs/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
--------------------------------------------------------------------------------
/watchdogs/static/css/fonts/element-icons.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/watchdogs/static/css/fonts/element-icons.woff
--------------------------------------------------------------------------------
/run_server.py:
--------------------------------------------------------------------------------
1 | from watchdogs.__main__ import main
2 |
3 | if __name__ == "__main__":
4 | # 1. pip install watchdogs
5 | # 2. python -m watchdogs
6 | main()
7 |
--------------------------------------------------------------------------------
/watchdogs/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from .config import Config
4 | from .main import init_app
5 |
6 | __version__ = '2.0.1'
7 | __all__ = ['Config', 'init_app']
8 | logging.getLogger('watchdogs').addHandler(logging.NullHandler())
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiofiles
2 | async_lru
3 | beautifulsoup4
4 | SQLAlchemy==1.4.41
5 | databases>=0.5.5
6 | pydantic<2.0.0
7 | fastapi
8 | fire
9 | jinja2
10 | jmespath
11 | jsonpath-rw-ext
12 | lxml
13 | objectpath
14 | pyyaml>=5.3
15 | selectolax
16 | toml
17 | torequests>=5.0.4
18 | uniparser>=3.0.2
19 | uvicorn
20 |
--------------------------------------------------------------------------------
/watchdogs/static/css/watchdogs.min.css:
--------------------------------------------------------------------------------
1 | .full-screen,body,.el-tabs__content,.el-tabs__content > *{width:100%;height:100%;}html{margin:0 auto;zoom:90%;width:90%;height:90%;}.el-tabs__item{font-weight:bold;}.el-message-box--center{min-width:50%;}.el-message-box{width:auto;}.time-td{min-width:16em;padding-left:3em;}#input_host_form > .el-form-item:first-child .el-form-item__content,#input_host_form > .el-form-item:first-child{width:100%;}[aria-label='Edit Crawler JSON'] .el-textarea__inner{height:10em;}.el-table_1_column_8 > .cell{white-space:nowrap;}div.foot{display:flex;justify-content:center;}.host-tag{margin:0.5em;cursor:pointer;}.el-table .warning-row{background:oldlace;}.cb_name{cursor:pointer;padding-left:1em;}p.custom_links{text-align:center;color:black;background-color:rgba(223,223,223,0.5);padding:0.5em 0 0.5em 0;box-shadow:3px 3px 5px #888888;}.request_args_pre{font-size:0.9em;}[v-cloak]{display:none;}.el-popover{max-width:50%;}.el-message-box.work_hours_doc{width:40%;}pre{word-wrap:break-word;white-space:pre-wrap;}
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 ClericPy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/watchdogs/background.py:
--------------------------------------------------------------------------------
1 | from asyncio import ensure_future, sleep
2 | from .utils import check_work_time, solo, try_catch
3 | from .config import Config
4 |
5 |
6 | async def crawl_chunks(crawl_once):
7 | loop_num = 0
8 | while not Config.is_shutdown:
9 | has_more = await crawl_once()
10 | if isinstance(has_more, Exception):
11 | Config.logger.error(f'crawl_once error, {has_more!r}')
12 | break
13 | Config.logger.info(
14 | f'crawl_once finished, has_more: {has_more}, loop: {loop_num}')
15 | if not has_more:
16 | break
17 | loop_num += 1
18 |
19 |
20 | async def background_loop(coro_funcs: list = None):
21 | while not Config.is_shutdown:
22 |         # non-blocking execution, constrained by the SoloLock class
23 | for func in coro_funcs:
24 | if func.__name__ == 'crawl_once':
25 | ensure_future(try_catch(crawl_chunks, func))
26 | else:
27 | ensure_future(try_catch(func))
28 | await sleep(Config.check_interval)
29 |
30 |
31 | async def db_backup_handler():
32 | logger = Config.logger
33 | if check_work_time(Config.db_backup_time):
34 | logger.warning(f'Backup DB start: {Config.db_backup_time}.')
35 | # may raise solo error
36 | with solo:
37 | result = await try_catch(Config.db_backup_function)
38 | logger.info(f'Backup DB finished: {result!r}')
39 |
--------------------------------------------------------------------------------
/watchdogs/static/css/watchdogs.css:
--------------------------------------------------------------------------------
1 | .full-screen,
2 | body,
3 | .el-tabs__content,
4 | .el-tabs__content > * {
5 | width: 100%;
6 | height: 100%;
7 | }
8 |
9 | html {
10 | margin: 0 auto;
11 | zoom: 90%;
12 | width: 90%;
13 | height: 90%;
14 | }
15 |
16 | .el-tabs__item {
17 | font-weight: bold;
18 | }
19 |
20 | .el-message-box--center {
21 | min-width: 50%;
22 | }
23 |
24 | .el-message-box {
25 | width: auto;
26 | }
27 |
28 | .time-td {
29 | min-width: 16em;
30 | padding-left: 3em;
31 | }
32 |
33 | #input_host_form > .el-form-item:first-child .el-form-item__content,
34 | #input_host_form > .el-form-item:first-child {
35 | width: 100%;
36 | }
37 |
38 | [aria-label='Edit Crawler JSON'] .el-textarea__inner {
39 | height: 10em;
40 | }
41 |
42 | .el-table_1_column_8 > .cell {
43 | white-space: nowrap;
44 | }
45 |
46 | div.foot {
47 | display: flex;
48 | justify-content: center;
49 | }
50 |
51 | .host-tag {
52 | margin: 0.5em;
53 | cursor: pointer;
54 | }
55 |
56 | .el-table .warning-row {
57 | background: oldlace;
58 | }
59 |
60 | .cb_name {
61 | cursor: pointer;
62 | padding-left: 1em;
63 | }
64 |
65 | p.custom_links {
66 | text-align: center;
67 | color: black;
68 | background-color: rgba(223, 223, 223, 0.5);
69 | padding: 0.5em 0 0.5em 0;
70 | box-shadow: 3px 3px 5px #888888;
71 | }
72 |
73 | .request_args_pre {
74 | font-size: 0.9em;
75 | }
76 |
77 | [v-cloak] {
78 | display: none;
79 | }
80 | .el-popover {
81 | max-width: 50%;
82 | }
83 | .el-message-box.work_hours_doc{
84 | width: 40%;
85 | }
86 | pre {
87 | word-wrap: break-word;
88 | white-space: pre-wrap;
89 | }
90 |
--------------------------------------------------------------------------------
/watchdogs/templates/auth.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | {{action}} Watchdogs v{{version}}
10 |
11 |
19 |
20 |
21 |
22 |
23 | Login
24 |
25 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | tmp.py
131 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import re
4 | import sys
5 |
6 | from setuptools import find_packages, setup
7 | """
8 | linux:
9 | rm -rf "dist/*";rm -rf "build/*";python3 setup.py bdist_wheel;twine upload "dist/*";rm -rf "dist/*";rm -rf "build/*"
10 | win32:
11 | rm -r -Force dist;rm -r -Force build;python3 setup.py bdist_wheel;twine upload "dist/*";rm -r -Force dist;rm -r -Force build;rm -r -Force watchdogs.egg-info
12 | """
13 |
14 | py_version = sys.version_info
15 | if py_version.major < 3 or py_version.minor < 6:
16 | raise RuntimeError('Only support python3.6+')
17 |
18 | with open('requirements.txt') as f:
19 | install_requires = [line for line in f.read().strip().split('\n')]
20 |
21 | with open("README.md", encoding="u8") as f:
22 | long_description = f.read()
23 |
24 | if not re.search(r'postgresql|mysql|sqlite', str(sys.argv)):
25 | install_requires.append('aiosqlite')
26 |
27 | here = os.path.abspath(os.path.dirname(__file__))
28 | with open(os.path.join(here, 'watchdogs', '__init__.py'), encoding="u8") as f:
29 | matched = re.search(r'''__version__ = ['"](.*?)['"]''', f.read())
30 | if not matched:
31 |         raise ValueError('Could not find the __version__ info.')
32 | version = matched.group(1)
33 |
34 | description = "Watchdogs to keep an eye on the world's change. Read more: https://github.com/ClericPy/watchdogs."
35 |
36 | setup(
37 | name="watchdogs",
38 | version=version,
39 | keywords="requests crawler uniparser torequests fastapi watchdog",
40 | description=description,
41 | long_description=long_description,
42 | long_description_content_type='text/markdown',
43 | license="MIT License",
44 | install_requires=install_requires,
45 | py_modules=["watchdogs"],
46 | package_data={
47 | 'watchdogs': [
48 | 'templates/*.html', 'static/img/*.*', 'static/js/*.js',
49 | 'static/css/*.css', 'static/css/fonts/*.*'
50 | ]
51 | },
52 | extras_require={
53 | "postgresql": ["asyncpg", "psycopg2-binary"],
54 | "mysql": ["aiomysql", "pymysql"],
55 | "sqlite": ["aiosqlite"]
56 | },
57 | classifiers=[
58 | "License :: OSI Approved :: MIT License",
59 | 'Programming Language :: Python',
60 | "Programming Language :: Python :: 3",
61 | "Programming Language :: Python :: 3.6",
62 | "Programming Language :: Python :: 3.7",
63 | "Programming Language :: Python :: 3.8",
64 | ],
65 | author="ClericPy",
66 | author_email="clericpy@gmail.com",
67 | url="https://github.com/ClericPy/watchdogs",
68 | packages=find_packages(),
69 | platforms="any",
70 | )
71 |
--------------------------------------------------------------------------------
/watchdogs/templates/logs.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
12 | Watchdogs logs
13 |
14 |
15 |
16 |
56 |
57 |
58 |
59 |
83 | {% for item in items %}
84 |
85 |
86 | {{item['name']}}.log
87 | {{item['line_no']}} lines ( {{item['file_size']}} /
89 | {{item['file_size_mb']}} MB ), st_mtime: {{item['st_mtime']}}
91 |
95 | Clear
97 |
98 |
99 | {{item['log_text']}}
100 | {% endfor %}
101 |
102 |
103 |
--------------------------------------------------------------------------------
/watchdogs/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from fire import Fire
4 | from uvicorn import run
5 |
6 | from .config import NotSet, ensure_dir
7 | from .settings import Config, get_valid_value, setup
8 |
9 |
10 | def clear_dir(dir_path):
11 | if not dir_path.is_dir():
12 |         print(f'Dir does not exist: {dir_path}.')
13 | return True
14 | print(f'Cleaning {dir_path}...')
15 | for f in dir_path.iterdir():
16 | if f.is_dir():
17 | clear_dir(f)
18 | else:
19 | f.unlink()
20 | print(f'File removed: {f}')
21 | dir_path.rmdir()
22 | print(f'Folder removed: {dir_path}')
23 |
24 |
25 | def init_app(db_url=None,
26 | password=None,
27 | uninstall=False,
28 | mute_std_log=NotSet,
29 | mute_file_log=NotSet,
30 | md5_salt=None,
31 | config_dir=None,
32 | use_default_cdn=False,
33 | allow_new_request=False,
34 | **uvicorn_kwargs):
35 | if config_dir:
36 | Config.CONFIG_DIR = ensure_dir(config_dir)
37 | if uninstall:
38 | clear_dir(Config.CONFIG_DIR)
39 | sys.exit('Config dir cleared.')
40 | if allow_new_request:
41 |         # allow using requests / aiohttp / tPool / Requests in UDFParser
42 | import aiohttp
43 | import requests
44 | from torequests.dummy import Requests
45 | from torequests.main import tPool
46 | from uniparser.parsers import UDFParser
47 |
48 | UDFParser._GLOBALS_ARGS.update(aiohttp=aiohttp,
49 | requests=requests,
50 | Requests=Requests,
51 | tPool=tPool)
52 | # backward compatibility for ignore_stdout_log & ignore_file_log
53 | Config.mute_std_log = get_valid_value(
54 | [uvicorn_kwargs.pop('ignore_stdout_log', NotSet), mute_std_log],
55 | Config.mute_std_log)
56 | Config.mute_file_log = get_valid_value(
57 | [uvicorn_kwargs.pop('ignore_file_log', NotSet), mute_file_log],
58 | Config.mute_file_log)
59 | # update by given uvicorn_kwargs
60 | Config.uvicorn_kwargs.update(uvicorn_kwargs)
61 | if db_url:
62 | # update by given db_url
63 | Config.db_url = db_url
64 | Config.password = password
65 | Config.md5_salt = md5_salt or ''
66 | setup(use_default_cdn=use_default_cdn)
67 | from .app import app
68 | return app
69 |
70 |
71 | def start_app(db_url=None,
72 | password=None,
73 | uninstall=False,
74 | mute_std_log=NotSet,
75 | mute_file_log=NotSet,
76 | md5_salt=None,
77 | config_dir=None,
78 | use_default_cdn=False,
79 | allow_new_request=False,
80 | **uvicorn_kwargs):
81 | app = init_app(db_url=db_url,
82 | password=password,
83 | uninstall=uninstall,
84 | mute_std_log=mute_std_log,
85 | mute_file_log=mute_file_log,
86 | md5_salt=md5_salt,
87 | config_dir=config_dir,
88 | use_default_cdn=use_default_cdn,
89 | allow_new_request=allow_new_request,
90 | **uvicorn_kwargs)
91 |
92 | run(app, **Config.uvicorn_kwargs)
93 |
94 |
95 | def main():
96 | argv = sys.argv
97 | if ('-h' in argv or '--help' in argv) and '--' not in argv:
98 | print(
99 | '"-h" and "--help" should be after "--", examples:\n > python -m watchdogs -- -h\n > python run_server.py -- -h'
100 | )
101 | return
102 | Fire(start_app)
103 |
104 |
105 | if __name__ == "__main__":
106 | main()
107 |
--------------------------------------------------------------------------------
/quick_start.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Given a mission
5 | Get the most popular repository on the GitHub Python trending page.
6 | 1. Here we crawl and parse the HTML from https://github.com/trending/python?since=daily
7 | 2. ~~Although you could also get it from api.github.com~~
8 |
9 | # Create a CrawlerRule
10 |
11 | 1. Get the request args
12 |     1. Use the URL: https://github.com/trending/python?since=daily
13 |     2. Or copy the cURL string from Chrome
14 |         1. Chrome dev tools -> Network -> url (right-click) -> Copy -> Copy as cURL
15 |         2. Some scenes need cookie authentication or anti-crawler headers.
16 |         3. 
17 | 2. Create the crawler rule
18 |     1. Open the watchdogs page. Default: http://127.0.0.1:9901/
19 |     2. Click the \ tab.
20 |     3. The first step is to set the CrawlerRule's meta info.
21 |         1. Now make sure the request is correct.
22 |         2. Click the \ link.
23 |         3. Input the cURL string or URL you got before.
24 |         4. 
25 |         5. It then generates the default regex & request args; they may need some changes to match more URL patterns.
26 |         6. Click the \ button and wait for it to finish downloading => Response Body [200]
27 |             1. If, after downloading, \ is still null, input it manually.
28 |         7. Check the downloaded source code and make sure it is what you want.
29 |             1. You can also check it in the parse rules by using a rule named `__schema__`: the parser will raise an error unless this `__schema__` rule returns `True`.
30 |     4. Now set the ParseRules of this CrawlerRule.
31 |         1. A valid CrawlerRule should contain a `text` rule and a `url` rule; the `url` rule is optional.
32 |         2. Delete the existing text rule and create a new parse rule named `list`.
33 |         3. Create a new Parse Rule like the one below: 
34 |             1. Here we get the list item for the child rules.
35 |         4. Then the `list` rule needs two child rules named `text` and `url`.
36 |         5. Create a new parse rule named `text` like this: 
37 |             1. Click the button to send the `text` rule to the `list` rule.
38 |         6. Create a new parse rule named `url` like `text`, or skip this rule. But the `$text` attribute should be `@href` to get the href attribute. This rule also needs to be sent to the `list` rule.
39 |     5. OK, now click the `Parse` button to parse this CrawlerRule and get the result.
40 |     6. Click the \<1. Save Crawler Rule\> button to save the rule into the database.
41 |
42 | > Parse result
43 |
44 | ```javascript
45 | {'Trending Python repositories on GitHub today · GitHub': {'list': {'text': 'gwen001 / pentest-tools', 'url': 'https://github.com/gwen001/pentest-tools'}}}
46 | ```
47 |
48 | > CrawlerRule JSON. This JSON string can be loaded by clicking the \ button.
49 |
50 | ```javascript
51 | {"name":"Trending Python repositories on GitHub today · GitHub","request_args":{"method":"get","url":"https://github.com/trending/python?since=daily","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"list","chain_rules":[["css","h1.lh-condensed>a","$string"],["python","index","0"],["re","=\"/","@=\"https://github.com/"]],"child_rules":[{"name":"text","chain_rules":[["css","a","$text"],["py","index","0"],["udf","input_object.strip().replace('\\n', '')",""]],"child_rules":[],"iter_parse_child":false},{"name":"url","chain_rules":[["css","a","@href"],["python","index","0"]],"child_rules":[],"iter_parse_child":false}],"iter_parse_child":false}],"regex":"^https://github\\.com/trending/python\\?since=daily$","encoding":""}
52 | ```
53 |
54 | # Create a Task
55 |
56 | 1. Click the \<2. Add New Task\> button.
57 | 2. Confirm the task info. 
58 | 3. Click the \ button. The task is created successfully.
59 |
60 | # Update a Task
61 |
62 | 1. Click the \ tab.
63 | 2. Double-click the task's row.
64 | 3. Update it and submit.
65 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [watchdogs](https://github.com/ClericPy/watchdogs) [](https://pypi.org/project/watchdogs/)
2 |
3 | Keep an eye on the changes of the web world.
4 |
5 | Such as `posted articles` / `news on web portals` / `server API health` / `binge-watching` / `Steam price fluctuations` / `GitHub events` / `updates of comics and novels`, and so on...
6 |
7 | ## Intro
8 |
9 | > [Chinese documentation (中文文档)](https://clericpy.github.io/blog/posts/20200331171211/)
10 |
11 | 1. This is a web app based on [fastapi](https://github.com/tiangolo/fastapi), [databases](https://github.com/encode/databases), [uniparser](https://github.com/ClericPy/uniparser), [torequests](https://github.com/ClericPy/torequests).
12 | 2. Deploy it smoothly with pip: `pip install -U watchdogs;python3 -m watchdogs`
13 | 3. Creating a new crawler with the Web UI is simple, unlike the old way of writing duplicated code.
14 | 4. All the crawlers keep running in an async environment.
15 | 5. Almost all the elements have a *title* attribute to describe the features in the Web UI, which means the docs live in the UI.
16 | 6. Free your hands from repetitively refreshing pages in the browser.
17 |     1. Subscribe to the change events with RSS reminder extensions, such as [Feedbro](https://chrome.google.com/webstore/detail/feedbro/mefgmmbdailogpfhfblcnnjfmnpnmdfa) or RSS Feed Reader.
18 |     2. Implement a class which inherits from `watchdogs.callbacks.Callback`, as sketched below.
19 |
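A minimal sketch of such a callback, assuming it is registered simply by subclassing (the class name, its `name`, and the printed message are hypothetical examples):

```python
# a hypothetical custom notifier; the module defining it just needs to be
# imported before the app starts, because the callback handler collects
# Callback.__subclasses__() when it initializes.
from watchdogs.callbacks import Callback


class PrintCallback(Callback):
    """Print newly crawled results. Enable it by setting task.custom_info to "print_demo"."""
    # `name` is matched against the part of task.custom_info before the first ":"
    name = "print_demo"

    async def callback(self, task):
        # task.latest_result holds the newest crawl result as a JSON string
        print(f'[{task.name}] changed: {task.latest_result}')
        return 'printed'
```
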
20 | ## Usage
21 |
22 | 1. > pip install -U watchdogs
23 |
24 | 2. > python -m watchdogs
25 |
26 | 3. > Open the browser: http://127.0.0.1:9901
27 |
28 | ### Command line args
29 |
30 | > python -m watchdogs -- -h
31 |
32 | - **db_url**:
33 | > a sqlite / mysql / postgresql (not tested) URL supported by [databases](https://github.com/encode/databases). Defaults to 'sqlite:///{HOME_PATH}/watchdogs/storage.sqlite'
34 | - **password**:
35 | > the initial password; if null, it can be set on the first visit to the web UI.
36 | - **mute_std_log**:
37 | > remove the stdout log for a clean stream
38 | - **mute_file_log**:
39 | > ignore the file logs located in the {HOME_PATH}/watchdogs folder.
40 | - **md5_salt**:
41 | > md5_salt for custom md5(password) / md5(rss_tag)
42 | - **config_dir**:
43 | > the config dir for saving logs and config files (including the sqlite file when using sqlite). Defaults to {HOME_PATH}/watchdogs
44 | - **use_default_cdn**:
45 | > if Config.cdn_urls is not set and use_default_cdn is True, online js/css CDN links from staticfile.org will be used.
46 | - **\*\*uvicorn_kwargs**:
47 | > uvicorn startup kwargs, such as port and host, which can be set like: `python -m watchdogs --port=9999 --host=127.0.0.1 --access-log=False`
48 |
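The same options can also be passed programmatically instead of on the command line. A minimal sketch, mirroring what `python -m watchdogs` does (the file name and the values below are only examples):

```python
# run_embedded.py -- hypothetical embedded startup using the exported init_app
from uvicorn import run

from watchdogs import Config, init_app

# keyword arguments not consumed by init_app are forwarded to Config.uvicorn_kwargs
app = init_app(db_url='sqlite:///./storage.sqlite', password='my-password', port=9901)
run(app, **Config.uvicorn_kwargs)
```
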
49 | ### Quick Start to Create New Task
50 |
51 | [Quick Start Screenshots](https://github.com/ClericPy/watchdogs/blob/master/quick_start.md)
52 |
53 |
54 | ## Web UI
55 |
56 |
57 | Screenshots
58 |
59 | 1. Welcome Page (Tasks Page).
60 | > Here you can see all the tasks' metadata, go to the RSS / Mobile Lite Page, and do some operations on the tasks.
61 |
62 | 
63 |
64 | 2. New Task Page.
65 | > This page is based on the latest [uniparser](https://github.com/ClericPy/uniparser) web app, for creating new rules as well as tasks.
66 |
67 | 
68 |
69 | 3. Rules Page.
70 | > Do some operations for the rules.
71 |
72 | 
73 |
74 | 4. API page.
75 | > Based on [fastapi](https://github.com/tiangolo/fastapi) `/docs` which is generated automatically.
76 |
77 | 
78 |
79 | 5. Mobile Page (Lite View).
80 | > For mobile phones, to glimpse the latest results of the current 30 tasks.
81 |
82 | 
83 |
84 |
85 |
86 |
92 |
--------------------------------------------------------------------------------
/watchdogs/templates/groups.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Watchdogs Groups v{{version}}
10 |
11 |
12 |
50 |
51 |
52 |
53 |
54 |
55 |
58 |
59 |
60 |
62 |
63 |
64 |
65 | {% for group in groups %}
66 |
77 | {% endfor %}
78 |
79 |
80 |
81 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/watchdogs/static/img/favicon.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
7 |
15 |
17 |
19 |
56 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/watchdogs/templates/feeds.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Watchdogs Timeline v{{version}}
10 |
11 |
12 |
172 |
173 |
174 |
175 |
176 |
179 |
180 |
181 |
182 | {% for task in error_tasks %}
183 |
[{{task.name}}]: {{task.error}}
184 | {% endfor %}
185 |
186 | {%- if not feeds -%}
187 |
No Feeds.
188 | {% endif %}
189 | {% for feed in feeds %}
190 | {%- if feed.get("name") -%}
191 |
192 |
195 |
{{feed.text}}
196 |
197 | {{feed.ts_create.strftime('%Y-%m-%d %H:%M:%S')}}
198 |
199 | - {{feed.timeago}} ago
200 |
201 |
202 | {% endif %}
203 | {%- if feed.get("current_date") -%}
204 |
{{feed["current_date"]}}
205 | {% endif %}
206 | {% endfor %}
207 |
208 |
209 |
210 | {% if last_page_url %}
211 |
<
212 | {% endif %}
213 |
214 |
215 | {% if next_page_url %}
216 |
>
217 | {% endif %}
218 |
219 |
220 |
223 |
224 |
225 |
226 |
--------------------------------------------------------------------------------
/watchdogs/callbacks.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from json import loads
3 | from logging import getLogger
4 | from traceback import format_exc
5 | from typing import Dict, Type
6 |
7 | from torequests.utils import ttime
8 |
9 | from .utils import ensure_await_result
10 |
11 |
12 | class CallbackHandlerBase(ABC):
13 | logger = getLogger('watchdogs')
14 |
15 | def __init__(self):
16 | # lazy init object
17 | self.callbacks_dict: Dict[str, Type[Callback]] = {}
18 | for cls in Callback.__subclasses__():
19 | try:
20 | assert cls.name is not None
21 | cls.doc = cls.doc or cls.__doc__
22 | self.callbacks_dict[cls.name] = cls
23 | except Exception as err:
24 | self.logger.error(f'{cls} registers failed: {err!r}')
25 | self.workers = {cb.name: cb.doc for cb in self.callbacks_dict.values()}
26 |
27 | @abstractmethod
28 | async def callback(self, task):
29 | pass
30 |
31 | def get_callback(self, name):
32 | obj = self.callbacks_dict.get(name)
33 | if not obj:
34 | # not found callback
35 | return None
36 | if not isinstance(obj, Callback):
37 | # here for lazy init
38 | obj = obj()
39 | self.callbacks_dict[name] = obj
40 | return obj
41 |
42 |
43 | class CallbackHandler(CallbackHandlerBase):
44 |
45 | def __init__(self):
46 | super().__init__()
47 |
48 | async def callback(self, task):
49 | custom_info: str = task.custom_info.strip()
50 | name = custom_info.split(':', 1)[0]
51 | cb = self.get_callback(name) or self.get_callback('')
52 | if not cb:
53 | # not found callback, ignore
54 | return
55 | try:
56 | call_result = await ensure_await_result(cb.callback(task))
57 | self.logger.info(
58 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} {call_result}: '
59 | )
60 | except Exception:
61 | self.logger.error(
62 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} error:\n{format_exc()}'
63 | )
64 |
65 |
66 | class Callback(ABC):
67 | """
68 |     Constraint: a Callback subclass should have these attributes:
69 |         cls.name: str
70 |         self.callback(task)
71 |     If name == '', it is the default callback for empty custom_info.
72 |     More common notify middlewares are coming.
73 | """
74 | logger = getLogger('watchdogs')
75 | # reset by subclass
76 | name: str = None
77 | doc = ''
78 |
79 | @abstractmethod
80 | def callback(self, task):
81 | """task attributes is new crawled"""
82 | pass
83 |
84 |
85 | class ServerChanCallback(Callback):
86 | """
87 |     WeChat notify toolkit.
88 |
89 |     1. Login with GitHub: http://sc.ftqq.com/
90 |     2. Open http://sc.ftqq.com/?c=code to get the SCKEY
91 |     3. Set the task.custom_info as: server_chan:{SCKEY}
92 | """
93 | name = "server_chan"
94 |
95 | # doc = 'http://sc.ftqq.com/'
96 | TEXT_SLICE_LENGTH = 200
97 |
98 | def __init__(self):
99 | from torequests.dummy import Requests
100 | self.req = Requests()
101 |
102 | async def callback(self, task):
103 | name, arg = task.custom_info.split(':', 1)
104 | if not arg:
105 | raise ValueError(
106 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`'
107 | )
108 | latest_result = loads(task.latest_result or '{}')
109 | text = latest_result.get('text') or ''
110 | url = latest_result.get('url') or task.origin_url
111 | title = f'{task.name}#{text[:self.TEXT_SLICE_LENGTH]}'
112 | body = f'{url}\n\n{text}'
113 | oks = []
114 | for key in set(arg.strip().split()):
115 | if not key or not key.strip():
116 | continue
117 | key = key.strip()
118 | r = await self.req.post(f'https://sc.ftqq.com/{key}.send',
119 | data={
120 | 'text': title,
121 | 'desp': body
122 | })
123 | self.logger.info(f'ServerChanCallback ({key}): {r.text}')
124 | oks.append((key, bool(r)))
125 |         return f'{len(oks)} sent, {oks}'
126 |
127 |
128 | class DingTalkCallback(Callback):
129 | """
130 |     DingTalk robot notify toolkit. It auto-detects the msg type (text / actionCard).
131 |
132 |     1. Create a group.
133 |     2. Create a custom robot whose security keyword contains ":"
134 |     3. Set the task.custom_info as: dingding:{access_token}
135 |
136 | Doc: https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq/e9d991e2
137 | """
138 | name = "dingding"
139 |
140 | def __init__(self):
141 | from torequests.dummy import Requests
142 | self.req = Requests()
143 |
144 | def make_data(self, task):
145 | latest_result = loads(task.latest_result or '{}')
146 | title = latest_result.get('title') or ''
147 | url = latest_result.get('url') or task.origin_url
148 | text = latest_result.get('text') or ''
149 | cover = latest_result.get('cover') or ''
150 | if cover:
151 |             text = f'![cover]({cover})\n{text}'
152 | if url or cover:
153 | # markdown
154 | title = f'# {task.name}: {title}\n> {ttime()}'
155 | return {
156 | "actionCard": {
157 | "title": title,
158 | "text": f'{title}\n\n{text}',
159 | "singleTitle": "Read More",
160 | "singleURL": url
161 | },
162 | "msgtype": "actionCard"
163 | }
164 | return {
165 | "msgtype": "text",
166 | "text": {
167 | "content": f"{task.name}: {title}\n{text}"
168 | }
169 | }
170 |
171 | async def callback(self, task):
172 | name, arg = task.custom_info.split(':', 1)
173 | if not arg:
174 | raise ValueError(
175 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`'
176 | )
177 |
178 | data = self.make_data(task)
179 | oks = []
180 | for access_token in set(arg.strip().split()):
181 | if not access_token or not access_token.strip():
182 | continue
183 | access_token = access_token.strip()
184 | r = await self.req.post(
185 | f'https://oapi.dingtalk.com/robot/send?access_token={access_token}',
186 | json=data)
187 | self.logger.info(
188 | f'{self.__class__.__name__} ({access_token}): {r.text}')
189 | oks.append((access_token, bool(r)))
190 |         return f'{len(oks)} sent, {oks}'
191 |
--------------------------------------------------------------------------------
/watchdogs/templates/lite.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Watchdogs Lite v{{version}}
10 |
11 |
12 |
168 |
169 |
170 |
171 |
172 |
175 |
176 |
177 | {%- if not tasks -%}
178 |
No Tasks.
179 | {% endif %} {% for task in tasks %}
180 |
181 |
184 |
{{task.text}}
185 |
186 | {{ '✔' if task.enable else '✖'}}
187 | {{task.last_change_time.strftime('%Y-%m-%d %H:%M:%S')}}
188 |
189 | - {{task.timeago}} ago
190 | 📂
191 | View More
192 | {%- if task.error -%}
193 |
194 |
195 | Error: {{task.error}}
196 |
197 | {% endif %}
198 |
199 |
200 | {% endfor %}
201 |
202 |
203 |
204 | {% if last_page_url %}
205 |
<
206 | {% endif %}
207 |
208 |
209 | {% if next_page_url %}
210 |
>
211 | {% endif %}
212 |
213 |
214 |
265 |
266 |
267 |
268 |
--------------------------------------------------------------------------------
/watchdogs/config.py:
--------------------------------------------------------------------------------
1 | from logging import ERROR, INFO, Formatter, getLogger
2 | from pathlib import Path
3 | from time import time
4 | from traceback import format_exc
5 | from typing import Any, Callable, Dict, List
6 |
7 | from databases import Database
8 | from fastapi import Request
9 | from fastapi.middleware.gzip import GZipMiddleware
10 | from frequency_controller import AsyncFrequency
11 | from starlette.middleware.base import BaseHTTPMiddleware
12 | from starlette.responses import JSONResponse, RedirectResponse
13 | from torequests.utils import md5 as _md5
14 | from torequests.utils import parse_qsl, quote_plus, unparse_qsl
15 | from uniparser.crawler import RuleStorage
16 |
17 | from .callbacks import CallbackHandlerBase
18 |
19 | logger = getLogger('watchdogs')
20 | logger.setLevel(INFO)
21 |
22 | NotSet = object()
23 |
24 |
25 | # @app.exception_handler(Exception)
26 | async def exception_handler(request: Request, exc: Exception):
27 | trace_id = str(int(time() * 1000))
28 | err_name = exc.__class__.__name__
29 | err_value = str(exc)
30 | msg = f'{err_name}({err_value}) trace_id: {trace_id}:\n{format_exc()}'
31 | logger.error(msg)
32 | return JSONResponse(
33 | status_code=500,
34 | content={
35 | "message": f"Oops! {err_name}.",
36 | "trace_id": trace_id
37 | },
38 | )
39 |
40 |
41 | def ensure_dir(path: Path):
42 | if isinstance(path, str):
43 | path = Path(path)
44 | if path.is_dir():
45 | return path
46 | else:
47 | paths = list(reversed(path.parents))
48 | paths.append(path)
49 | p: Path
50 | for p in paths:
51 | if not p.is_dir():
52 | p.mkdir()
53 | return path
54 |
55 |
56 | def get_sign(path, query):
57 | given_sign = ''
58 | query_list = []
59 | for key, value in parse_qsl(query, keep_blank_values=True):
60 | if key == 'sign':
61 | given_sign = value
62 | else:
63 | query_list.append(f'{key}={value}')
64 | query_list.sort()
65 | valid_sign = md5(f'{path}?{"&".join(query_list)}')
66 | return given_sign, valid_sign
67 |
68 |
69 | async def auth_checker(request: Request, call_next):
70 | # {'type': 'http', 'http_version': '1.1', 'server': ('127.0.0.1', 9901), 'client': ('127.0.0.1', 7037), 'scheme': 'http', 'method': 'GET', 'root_path': '', 'path': '/auth', 'raw_path': b'/auth', 'query_string': b'', 'headers': [(b'host', b'127.0.0.1:9901'), (b'connection', b'keep-alive'), (b'sec-fetch-dest', b'image'), (b'user-agent', b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'), (b'dnt', b'1'), (b'accept', b'image/webp,image/apng,image/*,*/*;q=0.8'), (b'sec-fetch-site', b'same-origin'), (b'sec-fetch-mode', b'no-cors'), (b'referer', b'http://127.0.0.1:9901/auth'), (b'accept-encoding', b'gzip, deflate, br'), (b'accept-language', b'zh-CN,zh;q=0.9'), (b'cookie', b'ads_id=lakdsjflakjdf; _ga=GA1.1.1550108461.1583462251')], 'fastapi_astack': , 'app': }
71 | path = request.scope['path']
72 | if path in Config.AUTH_PATH_WHITE_LIST:
73 | # ignore auth check
74 | return await call_next(request)
75 | query_string = request.scope.get('query_string', b'').decode('u8')
76 | query_has_sign = 'sign=' in query_string
77 | if query_has_sign:
78 | # try checking sign
79 | given_sign, valid_sign = Config.get_sign(path, query_string)
80 | if given_sign == valid_sign:
81 | # sign checking pass
82 | return await call_next(request)
83 | # try check cookie
84 | if not Config.watchdog_auth or Config.watchdog_auth == request.cookies.get(
85 | 'watchdog_auth', ''):
86 | # valid cookie, or no watchdog_auth checker
87 | return await call_next(request)
88 | # not pass either checker, refused
89 | if query_has_sign:
90 | # request with sign will not redirect
91 | return JSONResponse(
92 | status_code=400,
93 | content={
94 | "message": 'signature expired',
95 | },
96 | )
97 | else:
98 | # bad cookie, reset the watchdog_auth cookie as null
99 | resp = RedirectResponse(
100 | f'/auth?redirect={quote_plus(request.scope["path"])}', 302)
101 | resp.set_cookie('watchdog_auth', '')
102 | return resp
103 |
104 |
105 | class Config:
106 | CONFIG_DIR: Path = ensure_dir(Path.home() / 'watchdogs')
107 | ENCODING = 'utf-8'
108 | AUTH_PATH_WHITE_LIST = {'/auth'}
109 | # db_url defaults to sqlite://
110 | db_url: str = f'sqlite:///{(CONFIG_DIR / "storage.sqlite").as_posix()}'
111 | db: Database = None
112 | logger = logger
113 | password: str = ''
114 | rule_db: RuleStorage = None
115 | metas = None
116 | check_interval: int = 60
117 | default_interval: int = 5 * 60
118 | default_crawler_timeout: int = 30
119 | downloader_timeout: int = 15
120 | watchdog_auth: str = ''
121 | md5_salt: str = ''
122 | crawler = None
123 | # anti brute force attack
124 | check_pwd_freq = AsyncFrequency(1, 3)
125 | # for anti-crawl frequency
126 | DEFAULT_HOST_FREQUENCY = (1, 1)
127 | cdn_urls: dict = {}
128 | callback_handler: CallbackHandlerBase = None
129 | mute_std_log = False
130 | mute_file_log = False
131 | LOGGING_FILE_CONFIG = {
132 | 'info.log': {
133 | 'file_size_mb': 2,
134 | 'level': INFO,
135 | 'backup_count': 1,
136 | },
137 | 'error.log': {
138 | 'file_size_mb': 2,
139 | 'level': ERROR,
140 | 'backup_count': 1,
141 | },
142 | 'server.log': {
143 | 'file_size_mb': 2,
144 | 'level': INFO,
145 | 'backup_count': 1,
146 | },
147 | }
148 | DEFAULT_LOGGER_FORMATTER = Formatter(
149 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s",
150 | datefmt="%Y-%m-%d %H:%M:%S")
151 | uvicorn_kwargs: dict = {'access_log': True, 'port': 9901}
152 |     # check_interval is 60s, so the format must use %M; backup every 12 hours. This pattern may be missed if a crawl costs more than 60s.
153 | # db_backup_time: str = '%H:%M==00:00|%H:%M==12:00'
154 | db_backup_time: str = '%H:%M==00:00'
155 | db_backup_count: int = 4
156 | db_backup_function: Callable[..., Any] = None
157 | exception_handlers: list = [
158 | (Exception, exception_handler),
159 | ]
160 | middlewares = [
161 | {
162 | 'middleware_class': BaseHTTPMiddleware,
163 | 'dispatch': auth_checker
164 | },
165 | {
166 | 'middleware_class': GZipMiddleware,
167 | 'minimum_size': 1000
168 | },
169 | ]
170 | md5_cache_maxsize = 128
171 | query_groups_cache_maxsize = 128
172 | query_group_task_ids_cache_maxsize = 128
173 | query_task_ids_cache_maxsize = 128
174 | query_tasks_cache_maxsize = 128
175 | query_feeds_cache_maxsize = 128
176 | metas_cache_maxsize = 128
177 | sign_cache_maxsize = 128
178 | _md5 = _md5
179 | get_sign = get_sign
180 | background_task = None
181 | background_funcs: List[Callable] = []
182 | is_shutdown = False
183 | custom_links = [
184 | {
185 | 'label': 'Auth',
186 | 'url': '/auth',
187 | 'desc': 'change your password',
188 | },
189 | {
190 | 'label': 'Logs',
191 | 'url': '/log',
192 | 'desc': 'view the logs',
193 | },
194 | {
195 | 'label': 'Docs',
196 | 'url': '/docs',
197 | 'desc': 'read the docs',
198 | },
199 | {
200 | 'label': 'Groups',
201 | 'url': '/groups',
202 | 'desc': 'admin the groups',
203 | },
204 | ]
205 | # custom_tabs = [{'name': 'apis', 'label': 'API', 'url': '/docs'}]
206 | custom_tabs: List[Dict] = []
207 | COLLATION: str = None
208 | cookie_max_age = 86400 * 7
209 | default_page_size = 20
210 | TEXT_SLICE_LENGTH = 200
211 |
212 | @classmethod
213 | def get_route(cls, path, **kwargs):
214 | params_string = unparse_qsl([
215 | (k, str(v)) for k, v in kwargs.items() if str(v)
216 | ])
217 | sign = cls.get_sign(path, params_string)[1]
218 | if params_string:
219 | result = f'{path}?{params_string}&sign={sign}'
220 | else:
221 | result = f'{path}?sign={sign}'
222 | return result
223 |
224 | @classmethod
225 | def add_custom_tabs(cls, label, url, name=None, desc=None):
226 | # desc is nonsense
227 | assert name or label
228 | cls.custom_tabs.append({
229 | 'label': label,
230 | 'name': name or label,
231 | 'url': url,
232 | 'desc': desc
233 | })
234 |
235 | @classmethod
236 | def add_custom_links(cls, url, name, label=None, desc=None):
237 | assert name or label
238 |         cls.custom_links.append({
239 | 'name': name or label,
240 | 'label': label or name,
241 | 'url': url,
242 | 'desc': desc
243 | })
244 |
245 | @classmethod
246 | def setup_middleware(cls, app):
247 | for middleware in cls.middlewares:
248 | app.add_middleware(**middleware)
249 |
250 |
251 | def md5(obj, n=32, with_salt=True):
252 | if not with_salt:
253 | return Config._md5(str(obj).encode('utf-8'), n=n, skip_encode=True)
254 | salt = Config.md5_salt
255 | if not salt:
256 | raise ValueError('Config.md5_salt should not be null')
257 | return Config._md5(f'{obj}{salt}'.encode('utf-8'), n=n)
258 |
259 |
260 | async def md5_checker(obj, target, freq=False):
261 | if freq:
262 | async with Config.check_pwd_freq:
263 | # anti guessing password
264 | return md5(obj) == target
265 | else:
266 | # may get a cache
267 | return md5(obj) == target
268 |
--------------------------------------------------------------------------------
/watchdogs/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datetime import datetime
3 | from inspect import isawaitable
4 | from json import dumps, loads
5 | from logging import getLogger
6 | from sys import _getframe
7 | from traceback import format_exc
8 | from typing import Optional
9 | from xml.sax.saxutils import escape
10 |
11 | logger = getLogger('watchdogs')
12 |
13 |
14 | def format_size(size, rounded=2):
15 | unit = 'B'
16 | for _unit in ['B', 'KB', 'MB', 'GB']:
17 | unit = _unit
18 | if size > 1024:
19 | size = size / 1024
20 | else:
21 | break
22 | return f'{round(size, rounded)} {unit}'
23 |
24 |
25 | async def ensure_await_result(result):
26 | if isawaitable(result):
27 | return await result
28 | return result
29 |
30 |
31 | def _check_work_time(work_hours, now: Optional[datetime] = None):
32 | now = now or datetime.now()
33 | if '==' in work_hours:
34 | # check work days, using strftime
35 | fmt, target = work_hours.split('==')
36 | current = now.strftime(fmt)
37 | # check current time format equals to target
38 | return current == target
39 | elif '!=' in work_hours:
40 | # check work days, using strftime
41 | fmt, target = work_hours.split('!=')
42 | current = now.strftime(fmt)
43 | # check current time format equals to target
44 | return current != target
45 | else:
46 | # other hours format
47 | current_hour = now.hour
48 | if work_hours[0] == '[' and work_hours[-1] == ']':
49 | work_hours_list = sorted(loads(work_hours))
50 | else:
51 | nums = [int(num) for num in re.findall(r'\d+', work_hours)]
52 | work_hours_list = sorted(range(*nums))
53 | # check if current_hour is work hour
54 | return current_hour in work_hours_list
55 |
56 |
57 | def check_work_time(work_hours, now: Optional[datetime] = None):
58 | """Check time if fit work_hours.
59 |
60 | :: Test Code
61 |
62 | from watchdogs.utils import check_work_time, datetime
63 |
64 | now = datetime.strptime('2020-03-14 11:47:32', '%Y-%m-%d %H:%M:%S')
65 |
66 | oks = [
67 | '0, 24',
68 | '[1, 2, 3, 11]',
69 | '[1, 2, 3, 11];%Y==2020',
70 | '%d==14',
71 | '16, 24|[11]',
72 | '16, 24|%M==47',
73 | '%M==46|%M==47',
74 | '%H!=11|%d!=12',
75 | '16, 24|%M!=41',
76 | ]
77 |
78 | for work_hours in oks:
79 | ok = check_work_time(work_hours, now)
80 | print(ok, work_hours)
81 | assert ok
82 |
83 | no_oks = [
84 | '0, 5',
85 | '[1, 2, 3, 5]',
86 | '[1, 2, 3, 11];%Y==2021',
87 | '%d==11',
88 | '16, 24|[12]',
89 | '%M==17|16, 24',
90 | '%M==46|[1, 2, 3]',
91 | '%H!=11&%d!=12',
92 | '%M!=46;%M!=47',
93 | ]
94 |
95 | for work_hours in no_oks:
96 | ok = check_work_time(work_hours, now)
97 | print(ok, work_hours)
98 | assert not ok
99 |
100 |
101 | """
102 | now = now or datetime.now()
103 | if '|' in work_hours:
104 | if '&' in work_hours or ';' in work_hours:
105 | raise ValueError('| can not use with "&" or ";"')
106 | return any((_check_work_time(partial_work_hour, now)
107 | for partial_work_hour in work_hours.split('|')))
108 | else:
109 | if ('&' in work_hours or ';' in work_hours) and '|' in work_hours:
110 | raise ValueError('| can not use with "&" or ";"')
111 | return all((_check_work_time(partial_work_hour, now)
112 | for partial_work_hour in re.split('&|;', work_hours)))
113 |
114 |
115 | def get_watchdog_result(item):
116 | """
117 | Parse result format like:
118 | {'text': 'xxx'}
119 | {'text': 'xxx', 'url': 'xxx'}
120 | {'rule_name': {'text': 'xxx'}}
121 | {'__result__': {'rule_name': {'text': 'xxx'}}}
122 |
123 | def test_result_schema():
124 | # standard result
125 | result = get_watchdog_result({
126 | 'url': 'https://www.python.org/dev/peps/pep-0001',
127 | 'text': 'text'
128 | })
129 | # print(result)
130 | assert result == {
131 | 'url': 'https://www.python.org/dev/peps/pep-0001',
132 | 'text': 'text'
133 | }
134 | # only text
135 | result = get_watchdog_result('https://www.python.org/dev/peps/pep-0001')
136 | # print(result)
137 | assert result == {'text': 'text not found'}
138 | # embed request
139 | result = get_watchdog_result({
140 | '__request__': 'https://www.python.org/dev/peps/pep-0001',
141 | '__result__': {
142 | 'detail': {
143 | 'text': 'PEP 1 -- PEP Purpose and Guidelines'
144 | }
145 | }
146 | })
147 | # print(result)
148 | assert result == {'text': 'PEP 1 -- PEP Purpose and Guidelines'}
149 | # embed request list
150 | result = get_watchdog_result({
151 | '__request__': 'https://www.python.org/dev/peps/pep-0001',
152 | '__result__': {
153 | 'detail': [{
154 | 'text': 'PEP 1 -- PEP Purpose and Guidelines'
155 | }]
156 | }
157 | })
158 | # print(result)
159 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}]
160 | # embed request list2
161 | result = get_watchdog_result({
162 | '__request__': 'https://www.python.org/dev/peps/pep-0001',
163 | '__result__': {
164 | 'rule_name': {
165 | '__result__': {
166 | 'detail': [{
167 | 'text': 'PEP 1 -- PEP Purpose and Guidelines'
168 | }]
169 | }
170 | }
171 | }
172 | })
173 | # print(result)
174 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}]
175 | # child rule result
176 | result = get_watchdog_result({
177 | 'url': 'https://www.python.org/dev/peps/pep-0001',
178 | 'text': 'text'
179 | })
180 | # print(result)
181 | assert result == {
182 | 'text': 'text',
183 | 'url': 'https://www.python.org/dev/peps/pep-0001'
184 | }
185 | result = get_watchdog_result({
186 | 'list': {
187 | 'detail': [{
188 | 'text': 'Wake up to WonderWidgets!',
189 | 'url': 'all'
190 | }, {
191 | 'text': 'Overview',
192 | 'url': 'all'
193 | }]
194 | }
195 | })
196 | # print(result)
197 | assert result == [{
198 | 'text': 'Wake up to WonderWidgets!',
199 | 'url': 'all'
200 | }, {
201 | 'text': 'Overview',
202 | 'url': 'all'
203 | }]
204 |
205 | """
206 | result = {'text': 'text not found'}
207 | if isinstance(item, dict):
208 | __result__ = item.pop('__result__', None)
209 | if __result__:
210 | # may be __result__ > __result__ > __result__ nested...
211 | return get_watchdog_result(__result__.popitem()[1])
212 | text = item.get('text')
213 | if text is None:
214 | return get_watchdog_result(item.popitem()[1])
215 | result = {'text': str(text)}
216 | for key in ['__key__', 'unique', 'key', 'cover', 'url', 'title']:
217 | if key in item:
218 | value = item[key]
219 | if value and str(value):
220 | result[key] = str(value)
221 |
222 | elif isinstance(item, (list, tuple)):
223 | result = [get_watchdog_result(i) for i in item]
224 | return result
225 |
226 |
227 | class SoloLock:
228 |
229 | def __init__(self):
230 | self.runnings: set = set()
231 |
232 | @property
233 | def current_name(self):
234 | return _getframe(2).f_code.co_name
235 |
236 | def acquire(self, name=None):
237 | name = name or self.current_name
238 | if name in self.runnings:
239 | raise RuntimeError(f'[{name}] is still running.')
240 | self.runnings.add(name)
241 |
242 | def release(self, name=None):
243 | name = name or self.current_name
244 | self.runnings.discard(name)
245 |
246 | def __enter__(self):
247 | self.acquire(self.current_name)
248 | return self
249 |
250 | def __exit__(self, *args):
251 | self.release(self.current_name)
252 |         return False  # returning a truthy value here would suppress exceptions from the with-block
253 |
254 |
255 | async def try_catch(func, *args, **kwargs):
256 | try:
257 | return await ensure_await_result(func(*args, **kwargs))
258 | except BaseException as err:
259 | logger.error(
260 | f'Catch an error while running {func.__name__}: {format_exc()}')
261 | return err
262 |
263 |
264 | def ignore_error(func, *args, **kwargs):
265 | try:
266 | return func(*args, **kwargs)
267 | except BaseException as err:
268 | return err
269 |
270 |
271 | def gen_rss(data):
272 | nodes = []
273 | channel = data['channel']
274 | item_keys = ['title', 'description', 'link', 'guid', 'pubDate']
275 | for item in data['items']:
276 | item_nodes = []
277 | for key in item_keys:
278 | value = item.get(key)
279 | if value:
280 |                 item_nodes.append(f'<{key}>{escape(value)}</{key}>')
281 |         nodes.append(''.join(item_nodes))
282 |     items_string = ''.join((f'<item>{tmp}</item>' for tmp in nodes))
283 |     return rf'''<?xml version="1.0" encoding="UTF-8" ?>
284 | <rss version="2.0">
285 | <channel>
286 | <title>{channel['title']}</title>
287 | <link>{channel['link']}</link>
288 | <description>{channel['description']}</description>
289 | <image>
290 | <url>{channel['link']}/static/img/favicon.svg</url>
291 | <title>{channel['title']}</title>
292 | <link>{channel['link']}</link>
293 | <width>32</width>
294 | <height>32</height>
295 | </image>
296 | {items_string}
297 | </channel>
298 | </rss>
299 | '''
300 |
301 |
302 | def get_result_key(result: dict):
303 | key = result.get('__key__', result.get('key'))
304 | if key:
305 | return key
306 | else:
307 | return dumps(result, sort_keys=True)
308 |
309 |
310 | solo = SoloLock()
311 |
--------------------------------------------------------------------------------
/watchdogs/static/js/clipboard.min.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * clipboard.js v2.0.4
3 | * https://zenorocha.github.io/clipboard.js
4 | *
5 | * Licensed MIT © Zeno Rocha
6 | */
7 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n({activeName:"tasks",uniparser_iframe_loaded:!1,task_info_visible:!1,rule_info_visible:!1,current_host_rule:{},new_task_form:{},has_more:!0,task_list:[],current_page:0,host_list:[],visible_host_list:[],current_host:"",tag_types:["","success","info","warning","danger"],query_tasks_args:{order_by:"last_change_time",sort:"desc",tag:""},callback_workers:{},custom_links:[],custom_tabs:[],current_cb_doc:"",init_iframe_rule_json:"",clicked_tab_names:{}}),methods:{add_new_task(){try{JSON.parse(this.new_task_form.result_list)}catch(e){this.$alert("Invalid JSON for result_list.");return}try{JSON.parse(this.new_task_form.request_args)}catch(s){this.$alert("Invalid JSON for request_args.");return}this.task_info_visible=!1;let t=JSON.stringify(this.new_task_form);this.$http.post("add_new_task",t).then(e=>{var s=e.body;"ok"==s.msg?(this.$message({message:"Update task "+this.new_task_form.name+" success: "+s.msg,type:"success"}),this.reload_tasks()):this.$message.error({message:"Update task "+this.new_task_form.name+" failed: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},init_iframe_crawler_rule(e){e?this.sub_app.new_rule_json=e:/httpbin\.org\/html/g.test(this.sub_app.new_rule_json)?this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"https://importpython.com/blog/feed/","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["xml","channel>item>title","$text"],["python","getitem","[0]"]],"child_rules":""},{"name":"url","chain_rules":[["xml","channel>item>link","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^https?://importpython.com/blog/feed/$","encoding":""}':this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"http://httpbin.org/html","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 
Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["css","body h1","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^http://httpbin.org/html$","encoding":""}',this.sub_app.input_object="",this.sub_app.request_status="",this.sub_app.load_rule()},load_rule(e){this.sub_app.new_rule_json=e,this.sub_app.load_rule()},view_host_by_req(e){let s=JSON.parse(e).url;if(!s){this.$alert("request_args.url should not be null");return}document.getElementById("tab-rules").click(),setTimeout(()=>{this.current_host=new URL(s).hostname},0),this.task_info_visible=!1},view_crawler_rule_by_req(e){if(!e){this.$alert("request_args should not be null");return}this.$http.post("find_crawler_rule",e).then(e=>{var s=e.body;if("ok"==s.msg){let t=JSON.parse(s.result);this.view_crawler_rule(t),this.task_info_visible=!1}else this.$message.error({message:"rule not find in db: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},view_crawler_rule(e){this.rule_info_visible=!1,document.getElementById("tab-new").click(),this.uniparser_iframe_loaded?this.init_iframe_crawler_rule(JSON.stringify(e)):this.init_iframe_rule_json=JSON.stringify(e)},edit_crawler_rule(e){this.$prompt("","Edit Crawler JSON",{confirmButtonText:"OK",cancelButtonText:"Cancel",center:!0,inputType:"textarea",closeOnClickModal:!1,inputValue:JSON.stringify(e,null,2)}).then(({value:e})=>{this.process_crawler_rule("add",JSON.parse(e),0)}).catch(e=>{this.$message({type:"error",message:e})})},process_crawler_rule(e,s,t){let r=JSON.stringify(s||JSON.parse(this.sub_app.current_crawler_rule_json)),a="crawler_rule."+e;1==t&&(a+="?force=1"),this.$http.post(a,r).then(t=>{var r=t.body;"ok"==r.msg?(this.$message({message:e+" rule success",type:"success"}),"pop"==e&&r.result&&this.show_host_rule(this.current_host_rule.host)):"add"==e&&/matched more than 1 rule/g.test(r.msg)?this.$confirm("Failed for url matched more than 1 rule, overwrite it?","Confirm",{confirmButtonText:"Yes",cancelButtonText:"No",type:"error"}).then(()=>{this.process_crawler_rule(e,s,1)}).catch(()=>{this.$message({type:"info",message:"Adding rule canceled."})}):this.$message.error({message:e+" rule failed: "+r.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},show_form_add_new_task(e){if(e){let s="";try{s=this.sub_app.crawler_rule.name}catch(t){console.log(t)}this.new_task_form={task_id:null,name:s,enable:1,tag:"default",error:"",request_args:"",origin_url:"",interval:300,work_hours:"0, 24",max_result_count:30,result_list:"[]",custom_info:""};let r=JSON.parse(this.sub_app.current_crawler_rule_json);this.new_task_form.request_args=JSON.stringify(r.request_args),this.new_task_form.origin_url=r.request_args.url||""}this.task_info_visible=!0},change_enable(e){this.$http.get("enable_task",{params:{task_id:e.task_id,enable:e.enable}}).then(e=>{var s=e.body;"ok"!=s.msg&&this.$message.error({message:"Update enable failed: "+s.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})},sort_change(e){this.query_tasks_args={order_by:e.column.label,sort:(e.column.order||"").replace("ending","")},this.reload_tasks()},reload_tasks(){this.task_list=[],this.current_page=0,this.load_tasks()},load_tasks(){let e=new 
URLSearchParams(window.location.search).get("tag");e?this.query_tasks_args.tag=e:this.query_tasks_args.tag="",current_page=this.current_page+1,this.query_tasks_args.page=current_page,this.$http.get("load_tasks",{params:this.query_tasks_args}).then(e=>{var s=e.body;"ok"==s.msg?(s.tasks.forEach(e=>{this.task_list.push(e)}),this.has_more=s.has_more,this.current_page=current_page):(this.$message.error({message:"Loading tasks failed: "+s.msg}),this.has_more=s.has_more)},e=>{this.$message.error({message:"connect failed: "+e.status})})},load_hosts(){this.$http.get("load_hosts",{params:{host:this.current_host}}).then(e=>{var s=e.body;this.current_host=s.host||"",this.host_list=s.hosts,this.visible_host_list=this.host_list},e=>{this.$message.error({message:"connect failed: "+e.status})})},init_iframe(){this.sub_app&&(this.init_iframe_crawler_rule(this.init_iframe_rule_json),this.init_iframe_rule_json&&(this.$message.success({message:"Rule loaded."}),this.init_iframe_rule_json=""),this.uniparser_iframe_loaded=!0)},handleClick(e){e.name in this.clicked_tab_names||(this.clicked_tab_names[e.name]=1,"rules"==e.name&&this.load_hosts())},escape_html:e=>e?e.replace(/[&<>'"]/g,e=>({"&":"&","<":"<",">":">","'":"'",'"':"""})[e]||e):"",show_time(e){var s='';JSON.parse(e.result_list||"[]"),s+='last_check_time '+e.last_check_time.replace(/\..*/,"").replace("T"," ")+" ",s+='next_check_time '+e.next_check_time.replace(/\..*/,"").replace("T"," ")+" ",s+='last_change_time '+e.last_change_time.replace(/\..*/,"").replace("T"," ")+" ",s+="
",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},get_latest_result(e,s=80){try{let t=JSON.parse(e);return t.title||t.text.slice(0,s)}catch(r){return e}},show_result_list(e){var s="",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},force_crawl(e,s){this.$http.get("force_crawl",{params:{task_name:s.name}}).then(t=>{var r=t.body;if("ok"==r.msg){let a=r.task;Vue.set(this.task_list,e,a),a.error?this.$message.error({message:"Crawl task "+s.name+" "+a.error}):this.$message.success({message:"Crawl task "+s.name+" success"})}else this.$message.error({message:"Crawl task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"force_crawl connect failed: "+e.status})})},row_db_click(e){this.update_task(e)},show_task_error(e){app.$alert(e.error,"Crawler Error",{closeOnClickModal:!0,closeOnPressEscape:!0,center:!0})},update_task(e){this.new_task_form={task_id:e.task_id,name:e.name,enable:e.enable,tag:e.tag,request_args:e.request_args,origin_url:e.origin_url,interval:e.interval,work_hours:e.work_hours,max_result_count:e.max_result_count,result_list:e.result_list||"[]",custom_info:e.custom_info},this.show_form_add_new_task(!1)},delete_task(e,s){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_task",{params:{task_id:s.task_id}}).then(t=>{var r=t.body;"ok"==r.msg?(this.$message.success({message:"Delete task "+s.name+" success"}),this.task_list.splice(e,1)):this.$message.error({message:"Delete task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},delete_host_rule(e){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.$message.success({message:"Delete host "+e+" rule success"}),this.current_host_rule={},this.rule_info_visible=!1,this.load_hosts()):this.$message.error({message:"Delete host "+e+" rule failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},show_host_rule(e){this.$http.get("get_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.current_host_rule=t.host_rule,this.rule_info_visible=!0):this.$message.error({message:"get_host_rule "+e+" failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})},show_work_hours_doc(){let e=``;this.$alert(e,"work_hours format doc",{dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0,customClass:"work_hours_doc"})},check_error_task({row:e,rowIndex:s}){if(e.error)return"warning-row"},click_cb_name(e){this.current_cb_doc=this.callback_workers[e],this.new_task_form.custom_info=e+":"},update_frequency(){let e=this.current_host_rule.host,s=this.current_host_rule.n||0,t=this.current_host_rule.interval||0;this.$http.get("update_host_freq",{params:{host:e,n:s,interval:t}}).then(r=>{var a=r.body;"ok"==a.msg?(this.$message({message:"Update frequency "+e+": "+a.msg,type:"success"}),this.current_host_rule.n=s,this.current_host_rule.interval=t):this.$message.error({message:"update_frequency "+e+" failed: 
"+JSON.stringify(a)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}},watch:{current_host:function(e){this.visible_host_list=[],/^https?:\/\//g.test(e)&&(e=new URL(e).hostname,this.current_host=e),this.host_list.forEach(s=>{s.name.includes(e)&&this.visible_host_list.push(s)})},task_info_visible:function(e){e||(this.current_cb_doc="")}},computed:{uni_iframe:()=>document.getElementById("uni_iframe"),sub_app(){let e=this.uni_iframe;if(e)return e.contentWindow.app}}},vue_app=Vue.extend(Main),app=new vue_app({delimiters:["${","}"]}).$mount("#app");(()=>{var e;let s=document.getElementById("init_vars"),t=JSON.parse(window.atob(s.innerHTML));Object.keys(t).forEach(e=>{app[e]=t[e]}),s.parentNode.removeChild(s),new IntersectionObserver(e=>{!(e[0].intersectionRatio<=0)&&app.has_more&&app.load_tasks()}).observe(document.getElementById("auto_load"))})();
2 |
--------------------------------------------------------------------------------
/watchdogs/settings.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from asyncio import ensure_future, get_event_loop
3 | from datetime import datetime
4 | from functools import lru_cache
5 | from json import dumps, loads
6 | from logging.handlers import RotatingFileHandler
7 |
8 | from frequency_controller import AsyncFrequency
9 | from uniparser.parsers import Uniparser
10 |
11 | from .config import Config, NotSet, ensure_dir, md5
12 |
13 |
14 | def get_valid_value(values: list, default=None, invalid=NotSet):
15 | for value in values:
16 | if value is not invalid:
17 | return value
18 | return default
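# Example: get_valid_value([NotSet, 0, 5], default=1) returns 0 (only the NotSet
# sentinel counts as invalid), while get_valid_value([NotSet], default=1) returns 1.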
19 |
20 |
21 | def get_file_handler(file_name,
22 | file_size_mb=2,
23 | backup_count=1,
24 | level=logging.INFO):
25 | handler = RotatingFileHandler(
26 | Config.CONFIG_DIR / file_name,
27 | maxBytes=1024 * 1024 * Config.LOGGING_FILE_CONFIG.get(
28 | file_name, {}).get('file_size_mb', file_size_mb),
29 | backupCount=Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
30 | 'backup_count', backup_count),
31 | encoding=Config.ENCODING)
32 | handler.setLevel(
33 | Config.LOGGING_FILE_CONFIG.get(file_name, {}).get('level', level))
34 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER)
35 | return handler
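# Per-file overrides come from Config.LOGGING_FILE_CONFIG, keyed by file name, e.g.
# (illustrative values only): {'info.log': {'file_size_mb': 5, 'backup_count': 2, 'level': logging.WARNING}}.
# Missing keys fall back to the defaults of this function.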
36 |
37 |
38 | def get_stream_handler(level=logging.INFO):
39 | handler = logging.StreamHandler()
40 | handler.setLevel(level)
41 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER)
42 | return handler
43 |
44 |
45 | def setup_logger():
46 | watchdogs_logger = logging.getLogger('watchdogs')
47 | uniparser_logger = logging.getLogger('uniparser')
48 | uvicorn_logger = logging.getLogger('uvicorn')
49 | if not Config.mute_file_log:
50 | info_handler = get_file_handler('info.log')
51 | watchdogs_logger.addHandler(info_handler)
52 | uniparser_logger.addHandler(info_handler)
53 |
54 | error_handler = get_file_handler('error.log')
55 | watchdogs_logger.addHandler(error_handler)
56 | uniparser_logger.addHandler(error_handler)
57 |
58 | server_handler = get_file_handler('server.log')
59 | uvicorn_logger.addHandler(server_handler)
60 |
61 | if not Config.mute_std_log:
62 | handler = get_stream_handler()
63 | watchdogs_logger.addHandler(handler)
64 | uniparser_logger.addHandler(handler)
65 | uvicorn_logger.addHandler(handler)
66 | return watchdogs_logger
67 |
68 |
69 | def setup_models():
70 | from databases import Database
71 |
72 | # import models lazily so cache sizes can be configured via Config first; the caches are created after main.init_app runs
73 | from .models import Metas, RuleStorageDB, create_tables
74 |
75 | Config.db = Database(Config.db_url)
76 | Config.rule_db = RuleStorageDB(Config.db)
77 | Config.metas = Metas(Config.db)
78 | # if Config.db_backup_function is None and Config.db_url.startswith(
79 | # 'sqlite:///'):
80 | # Config.db_backup_function = default_db_backup_sqlite
81 | create_tables(str(Config.db.url))
82 |
83 |
84 | async def setup_uniparser():
85 | import base64
86 | import binascii
87 | import datetime
88 | import math
89 | import random
90 | import re
91 |
92 | import uniparser.fastapi_ui
93 | from torequests.utils import (
94 | curlparse,
95 | escape,
96 | guess_interval,
97 | itertools_chain,
98 | json,
99 | parse_qs,
100 | parse_qsl,
101 | ptime,
102 | quote,
103 | quote_plus,
104 | slice_by_size,
105 | slice_into_pieces,
106 | split_n,
107 | timeago,
108 | ttime,
109 | unescape,
110 | unique,
111 | unquote,
112 | unquote_plus,
113 | urljoin,
114 | urlparse,
115 | urlsplit,
116 | urlunparse,
117 | )
118 | from uniparser.config import GlobalConfig
119 | from uniparser.parsers import UDFParser
120 | from uniparser.utils import TorequestsAiohttpAsyncAdapter
121 | UDFParser._GLOBALS_ARGS.update({
122 | 're': re,
123 | 'datetime': datetime,
124 | 'curlparse': curlparse,
125 | 'math': math,
126 | 'random': random,
127 | 'escape': escape,
128 | 'guess_interval': guess_interval,
129 | 'itertools_chain': itertools_chain,
130 | 'json': json,
131 | 'parse_qs': parse_qs,
132 | 'parse_qsl': parse_qsl,
133 | 'ptime': ptime,
134 | 'quote': quote,
135 | 'quote_plus': quote_plus,
136 | 'slice_by_size': slice_by_size,
137 | 'slice_into_pieces': slice_into_pieces,
138 | 'split_n': split_n,
139 | 'timeago': timeago,
140 | 'ttime': ttime,
141 | 'unescape': unescape,
142 | 'unique': unique,
143 | 'unquote': unquote,
144 | 'unquote_plus': unquote_plus,
145 | 'urljoin': urljoin,
146 | 'urlparse': urlparse,
147 | 'urlsplit': urlsplit,
148 | 'urlunparse': urlunparse,
149 | 'base64': base64,
150 | 'binascii': binascii,
151 | })
152 | GlobalConfig.GLOBAL_TIMEOUT = Config.downloader_timeout
153 | Uniparser._DEFAULT_ASYNC_FREQUENCY = AsyncFrequency(
154 | *Config.DEFAULT_HOST_FREQUENCY)
155 | await load_host_freqs()
156 | Config.uniparser = Uniparser(
157 | request_adapter=TorequestsAiohttpAsyncAdapter())
158 | uniparser.fastapi_ui.views.uni = Config.uniparser
159 |
160 |
161 | def setup_cdn_urls(use_default_cdn=False):
162 | from uniparser.fastapi_ui.views import cdn_urls
163 |
164 | if not Config.cdn_urls:
165 | # when cdn_urls is not set, choose between the default online CDN and local static files.
166 | if use_default_cdn:
167 | # default online cdn
168 | Config.cdn_urls = {
169 | 'VUE_JS_CDN': 'https://cdn.staticfile.org/vue/2.6.11/vue.min.js',
170 | 'ELEMENT_CSS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/theme-chalk/index.css',
171 | 'ELEMENT_JS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/index.js',
172 | 'VUE_RESOURCE_CDN': 'https://cdn.staticfile.org/vue-resource/1.5.1/vue-resource.min.js',
173 | 'CLIPBOARDJS_CDN': 'https://cdn.staticfile.org/clipboard.js/2.0.4/clipboard.min.js',
174 | }
175 | else:
176 | # local statics
177 | Config.cdn_urls = {
178 | 'VUE_JS_CDN': '/static/js/vue.min.js',
179 | 'ELEMENT_CSS_CDN': '/static/css/index.css',
180 | 'ELEMENT_JS_CDN': '/static/js/index.js',
181 | 'VUE_RESOURCE_CDN': '/static/js/vue-resource.min.js',
182 | 'CLIPBOARDJS_CDN': '/static/js/clipboard.min.js',
183 | }
184 | # overwrite uniparser's cdn
185 | cdn_urls.update(Config.cdn_urls)
186 |
187 |
188 | def setup_lru_cache():
189 | Config._md5 = lru_cache(maxsize=Config.md5_cache_maxsize)(Config._md5)
190 | Config.get_sign = lru_cache(maxsize=Config.sign_cache_maxsize)(
191 | Config.get_sign)
192 |
193 |
194 | def setup(use_default_cdn=False):
195 | setup_logger()
196 | setup_lru_cache()
197 | setup_cdn_urls(use_default_cdn=use_default_cdn)
198 | setup_models()
199 |
200 |
201 | async def setup_md5_salt():
202 | logger = Config.logger
203 | exist_salt = await Config.metas.get('md5_salt', None)
204 | if not Config.md5_salt:
205 | if exist_salt:
206 | # no need to update
207 | Config.md5_salt = exist_salt
208 | return
209 | else:
210 | # create new salt
211 | from uuid import uuid1
212 | Config.md5_salt = uuid1().hex
213 | elif Config.md5_salt == exist_salt:
214 | # no need to update
215 | return
216 | # need to update: new md5_salt from settings, or no exist_salt
217 | logger.critical(f'Setting md5_salt to {Config.md5_salt} and saving it into the db.')
218 | return await Config.metas.set('md5_salt', Config.md5_salt)
219 |
220 |
221 | async def setup_crawler():
222 | from uniparser import Crawler
223 |
224 | from .callbacks import CallbackHandler
225 |
226 | crawler = Crawler(uniparser=Config.uniparser, storage=Config.rule_db)
227 | Config.crawler = crawler
228 | if Config.callback_handler is None:
229 | Config.callback_handler = CallbackHandler()
230 | workers = ', '.join(Config.callback_handler.callbacks_dict.keys())
231 | Config.logger.info(f'Current online callbacks: {workers}')
232 |
233 |
234 | async def update_password(password=None):
235 | if password is not None:
236 | Config.password = password
237 | return await Config.metas.set('admin', Config.password)
238 |
239 |
240 | async def refresh_token():
241 | if Config.password:
242 | await update_password()
243 | password = Config.password
244 | else:
245 | password = await Config.metas.get('admin', '')
246 | if password:
247 | Config.watchdog_auth = md5(password)
248 |
249 |
250 | async def setup_background():
251 | from .background import background_loop, db_backup_handler
252 | from .crawler import crawl_once
253 | Config.background_funcs.append(crawl_once)
254 | if Config.db_backup_function:
255 | Config.background_funcs.append(db_backup_handler)
256 | Config.background_task = ensure_future(
257 | background_loop(Config.background_funcs))
258 |
259 |
260 | def setup_exception_handlers(app):
261 | for exc, callback in Config.exception_handlers:
262 | app.add_exception_handler(exc, callback)
263 |
264 |
265 | def mute_noise_logger():
266 | # uvicorn sets new handlers for the root logger and the access logger after the app is launched.
267 | logging.getLogger('').handlers.clear()
268 | if Config.uvicorn_kwargs['access_log']:
269 | # fix https://github.com/encode/uvicorn/issues/523
270 | access_logger = logging.getLogger('uvicorn.access')
271 | access_logger.propagate = True
272 | access_logger.handlers.clear()
273 |
274 |
275 | async def setup_app(app):
276 | mute_noise_logger()
277 | db = Config.db
278 | if not db:
279 | raise RuntimeError('No database?')
280 | await db.connect()
281 | await setup_md5_salt()
282 | # refresh_token should be after setup_md5_salt
283 | await refresh_token()
284 | setup_exception_handlers(app)
285 | # 1. uniparser first: setup_crawler below needs Config.uniparser
286 | await setup_uniparser()
287 | # 2. crawler next: the background crawl loop needs Config.crawler
288 | await setup_crawler()
289 | # 3. finally start the background loop
290 | await setup_background()
291 | from . import __version__
292 | Config.logger.info(
293 | f'App started, the current version is {__version__}, CONFIG_DIR: {Config.CONFIG_DIR}'
294 | )
295 |
296 |
297 | async def release_app(app):
298 | Config.is_shutdown = True
299 | if Config.background_task and not Config.background_task.done():
300 | Config.background_task.cancel()
301 | if Config.db:
302 | await Config.db.disconnect()
303 |
304 |
305 | async def default_db_backup_sqlite():
306 | current_time = datetime.now().strftime('%Y%m%d%H%M%S')
307 | for storage_path in Config.CONFIG_DIR.iterdir():
308 | if storage_path.name == 'storage.sqlite':
309 | import shutil
310 | from pathlib import Path
311 | backup_dir: Path = ensure_dir(Config.CONFIG_DIR / 'backups')
312 | backup_path = backup_dir / f'storage-{current_time}.sqlite'
313 | # Python 3.6 has no asyncio.get_running_loop, so use get_event_loop
314 | loop = get_event_loop()
315 | # wait for copy
316 | future = loop.run_in_executor(None, shutil.copy, str(storage_path),
317 | str(backup_path))
318 | await future
319 | # remove outdated backup files beyond db_backup_count
320 | backup_file_paths = sorted([i for i in backup_dir.iterdir()],
321 | key=lambda path: path.name,
322 | reverse=True)
323 | path_to_del = backup_file_paths[Config.db_backup_count:]
324 | for p in path_to_del:
325 | p.unlink()
326 |
327 |
328 | def get_host_freq_list(host):
329 | freq = Uniparser._HOST_FREQUENCIES.get(host, None)
330 | if freq:
331 | return [freq.n, freq.interval]
332 | else:
333 | return [None, 0]
334 |
335 |
336 | async def set_host_freq(host, n, interval):
337 | if n:
338 | Uniparser._HOST_FREQUENCIES[host] = AsyncFrequency(n, interval)
339 | else:
340 | Uniparser._HOST_FREQUENCIES.pop(host, None)
341 | await save_host_freqs()
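# Illustrative usage, assuming AsyncFrequency(n, interval) limits a host to at most n
# requests every `interval` seconds (as the web UI describes):
#   await set_host_freq('example.com', 2, 10)   # throttle example.com to 2 requests / 10s
#   get_host_freq_list('example.com')           # -> [2, 10]
#   await set_host_freq('example.com', 0, 0)    # n=0 removes the limit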
342 |
343 |
344 | async def save_host_freqs():
345 | items = {
346 | host: freq.to_list()
347 | for host, freq in Uniparser._HOST_FREQUENCIES.items()
348 | }
349 | await Config.metas.set('host_freqs', dumps(items))
350 |
351 |
352 | async def load_host_freqs():
353 | host_freqs_str = await Config.metas.get('host_freqs', default='{}')
354 | host_freqs = loads(host_freqs_str)
355 | Uniparser._HOST_FREQUENCIES = {
356 | host: AsyncFrequency(*args) for host, args in host_freqs.items()
357 | }
358 |
--------------------------------------------------------------------------------
/watchdogs/static/js/vue-resource.min.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * vue-resource v1.5.1
3 | * https://github.com/pagekit/vue-resource
4 | * Released under the MIT License.
5 | */
6 |
7 | !function(t,e){"object"==typeof exports&&"undefined"!=typeof module?module.exports=e():"function"==typeof define&&define.amd?define(e):t.VueResource=e()}(this,function(){"use strict";function u(t){this.state=2,this.value=void 0,this.deferred=[];var e=this;try{t(function(t){e.resolve(t)},function(t){e.reject(t)})}catch(t){e.reject(t)}}u.reject=function(n){return new u(function(t,e){e(n)})},u.resolve=function(n){return new u(function(t,e){t(n)})},u.all=function(s){return new u(function(n,t){var o=0,r=[];function e(e){return function(t){r[e]=t,(o+=1)===s.length&&n(r)}}0===s.length&&n(r);for(var i=0;i Tuple[bool, datetime]:
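--------------------------------------------------------------------------------
/watchdogs/crawler.py:
--------------------------------------------------------------------------------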
48 | '''
49 | Five formats are supported:
50 |
51 | 1. Two numbers split by ', ', as work_hours:
52 | 0, 24 means from 00:00 ~ 23:59, every day
53 | 2. JSON list of int, as work_hours:
54 | [1, 19] means 01:00~01:59 a.m. and 07:00~07:59 p.m., every day
55 | 3. Standard strftime format, as work_days:
56 | > Split work_hours by '==', then check whether
57 | datetime.now().strftime(wh[0]) == wh[1]
58 | %A==Friday means every Friday
59 | %m-%d==03-13 means 03-13 of every year
60 | %H==05 means every morning 05:00 ~ 05:59
61 | 4. Mix up work_days and work_hours:
62 | > Join work_days and work_hours with ';' / '&' for 'and', '|' for 'or'.
63 | > Supports == for equal, != for not equal.
64 | %w==5;20, 24 means every Friday 20:00 ~ 23:59
65 | [1, 2, 15];%w==5 means every Friday 1 a.m., 2 a.m. and 3 p.m.; the work_hours part is on the left side.
66 | %w==5|20, 24 means every Friday, or every day 20:00 ~ 23:59
67 | %w==5|%w==2 means every Friday or Tuesday
68 | %w!=6&%w!=0 means every day except Saturday & Sunday.
69 | 5. Set an ensured change interval
70 | > If the work_hours string ends with `#` plus `x` seconds, the next_change_time is checked first.
71 | > In other words, you are sure the interval between two changes is more than `x` seconds,
72 | > so this task will not be crawled again until `last_change_time + change_interval`.
73 | %w==5#86400 means every Friday, and only if there was no change within the last day
74 | 0, 24#3600 means any time, but after a change the task will not be crawled again within 3600 seconds.
75 | '''
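# A few concrete readings of the formats above (illustrative only):
#   '0, 24'         -> always inside work hours
#   '[8, 12, 20]'   -> only during 08:xx, 12:xx and 20:xx
#   '%w==5;20, 24'  -> Fridays between 20:00 and 23:59
#   '%w==5#86400'   -> Fridays, and only if the last change was more than a day ago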
76 | # decide whether now fits work_hours and compute the next check time
77 | now = now or datetime.now()
78 | work_hours = task.work_hours or '0, 24'
79 | if '#' in work_hours:
80 | # check if changed
81 | last_change_time = task.last_change_time or datetime.fromtimestamp(0)
82 | # split work_hours and change_interval
83 | work_hours, change_interval_str = work_hours.split('#')
84 | change_interval = int(change_interval_str)
85 | # if the change interval has not passed yet, wait for the remaining time.
86 | next_change_time = last_change_time + timedelta(seconds=change_interval)
87 | if now < next_change_time:
88 | Config.logger.info(
89 | f'Task [{task.name}] changed less than {timeago(change_interval, accuracy=1, format=1, short_name=1)} ago, skip.'
90 | )
91 | return False, next_change_time
92 |
93 | need_crawl = check_work_time(work_hours, now)
94 | if need_crawl:
95 | # now is within work hours; the next check is now + interval
96 | next_check_time = now + timedelta(seconds=task.interval)
97 | return need_crawl, next_check_time
98 | else:
99 | # now is outside work hours
100 | next_check_time = now
101 | # fast-forward interval by interval (at most 60 steps) to find a valid next_check_time
102 | for _ in range(60):
103 | # next interval
104 | next_check_time = next_check_time + timedelta(seconds=task.interval)
105 | _need_crawl = check_work_time(work_hours, next_check_time)
106 | if _need_crawl:
107 | # now is still outside work hours, but next_check_time falls inside
108 | break
109 | return need_crawl, next_check_time
110 |
111 |
112 | async def crawl(task: Task):
113 | crawler: Crawler = Config.crawler
114 | logger = Config.logger
115 | logger.info(f'Start crawling: {task.name}')
116 | crawl_result = await try_catch(crawler.acrawl, task.request_args)
117 | error = ''
118 | if isinstance(crawl_result, RuleNotFoundError):
119 | error = repr(crawl_result)
120 | logger.error(f'{task.name}: {error}')
121 | result_list = [{"text": error}]
122 | elif isinstance(crawl_result, BaseException):
123 | error = getattr(crawl_result, 'text', repr(crawl_result))
124 | logger.error(f'{task.name}: {error}')
125 | result_list = None
126 | else:
127 | if len(crawl_result) == 1:
128 | # crawl_result schema: {rule_name: list_or_dict}
129 | formated_result = get_watchdog_result(
130 | item=crawl_result.popitem()[1])
131 | if formated_result == {'text': 'text not found'}:
132 | error = f'{task.name} text not found, crawl result given: {crawl_result}'
133 | logger.error(error)
134 | result_list = None
135 | else:
136 | if isinstance(formated_result, list):
137 | result_list = formated_result
138 | else:
139 | result_list = [formated_result]
140 | # use force crawl on the web UI to see more logs
141 | logger.info(f'{task.name} Crawl success: {result_list}'[:150])
142 | else:
143 | error = 'Invalid crawl_result against schema {rule_name: [{"text": "Required", "url": "Optional", "key": "Optional", "unique": "Optional"}]}, given is %r' % crawl_result
144 | logger.error(f'{task.name}: {error}')
145 | result_list = [{"text": error}]
146 | return task, error, result_list
147 |
148 |
149 | async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
150 | """task_name means force crawl"""
151 | db: Database = Config.db
152 | now = datetime.now()
153 | logger = Config.logger
154 | logger.info(f'crawl_once task_name={task_name} start.')
155 | # sqlite does not have datediff...
156 | if task_name:
157 | query = tasks.select().where(tasks.c.name == task_name)
158 | else:
159 | query = tasks.select().where(tasks.c.enable == 1).where(
160 | tasks.c.next_check_time <= now)
161 | query = query.limit(chunk_size)
162 | todo = []
163 | now = datetime.now()
164 | update_values = []
165 | CLEAR_CACHE_NEEDED = False
166 | fetched_tasks = await db.fetch_all(query=query)
167 | has_more = len(fetched_tasks) >= chunk_size
168 | for _task in fetched_tasks:
169 | task = Task(**dict(_task))
170 | # check work hours
171 | need_crawl, next_check_time = find_next_check_time(task, now)
172 | if task_name:
173 | # always crawl for given task_name
174 | need_crawl = True
175 | if need_crawl:
176 | t = ensure_future(crawl(task))
177 | # attach task_name and task_id to the future, for logging and timeout error reporting
178 | setattr(t, 'task_name', task.name); setattr(t, 'task_id', task.task_id)
179 | todo.append(t)
180 | # update next_check_time
181 | values = {
182 | 'last_check_time': now,
183 | 'next_check_time': next_check_time,
184 | 'task_id': task.task_id
185 | }
186 | # update task variable for callback
187 | task.__dict__.update(values)
188 | update_values.append(values)
189 | if not need_crawl:
190 | logger.info(
191 | f'Task [{task.name}] is outside work hours, next_check_time reset to {next_check_time}'
192 | )
193 | update_query = 'update tasks set `last_check_time`=:last_check_time,`next_check_time`=:next_check_time where task_id=:task_id'
194 | await db.execute_many(query=update_query, values=update_values)
195 | if update_values:
196 | CLEAR_CACHE_NEEDED = True
197 | logger.info(f'crawl_once crawling {len(todo)} valid tasks.')
198 | if todo:
199 | crawl_errors = []
200 | done, pending = await wait(todo, timeout=Config.default_crawler_timeout)
201 | if pending:
202 | names = [getattr(t, 'task_name', None) for t in pending]
203 | logger.error(f'crawl timeout {len(names)}: {names}')
204 | for _pending in pending:
205 | crawl_errors.append({
206 | 'task_id': getattr(_pending, 'task_id', None),
207 | 'error': 'timeout(%s)' % Config.default_crawler_timeout
208 | })
209 | _pending.cancel()
210 | ttime_now = ttime()
211 | changed_tasks = []
212 | update_counts = 0
213 | for t in done:
214 | task, error, result_list = t.result()
215 | if error != task.error:
216 | crawl_errors.append({'task_id': task.task_id, 'error': error})
217 | if error or result_list is None:
218 | # ignore update this task
219 | continue
220 | # compare latest_result and new list
221 | # newest first, matching the ordering of the saved result_list
222 | old_latest_result = loads(task.latest_result)
223 | # try to use the key, or the result itself
224 | try:
225 | old_result_list = loads(
226 | task.result_list) if task.result_list else []
227 | except JSONDecodeError:
228 | old_result_list = []
229 | if old_latest_result.get('unique', True):
230 | # unique mode skips all duplicated results
231 | exist_keys = {
232 | get_result_key(_old_result['result'])
233 | for _old_result in old_result_list
234 | }
235 | else:
236 | old_latest_result_key = get_result_key(old_latest_result)
237 | exist_keys = {old_latest_result_key}
238 | # list of dicts; results are newest first, so stop at the first already-seen key
239 | to_insert_result_list = []
240 | for result in result_list:
241 | result_key = get_result_key(result)
242 | if result_key in exist_keys:
243 | break
244 | to_insert_result_list.append(result)
245 | if to_insert_result_list:
246 | # update db
247 | update_counts += 1
248 | # new result updated
249 | query = UpdateTaskQuery(task.task_id)
250 | # JSON
251 | new_latest_result = dumps(to_insert_result_list[0],
252 | sort_keys=True)
253 | query.add('latest_result', new_latest_result)
254 | query.add('last_change_time', now)
255 | # insert older results first so the newest stays on top
256 | new_seeds = []
257 | for result in to_insert_result_list[::-1]:
258 | # result is dict, not json string
259 | old_result_list.insert(0, {
260 | 'result': result,
261 | 'time': ttime_now
262 | })
263 | new_seeds.append(result)
264 | await save_feed(new_seeds, db, task)
265 | new_result_list = dumps(old_result_list[:task.max_result_count])
266 | query.add('result_list', new_result_list)
267 | logger.info(f'[Updated] {task.name}. +++')
268 | await db.execute(**query.kwargs)
269 | task.latest_result = new_latest_result
270 | task.last_change_time = now
271 | task.result_list = new_result_list
272 | changed_tasks.append(task)
273 | if crawl_errors:
274 | update_query = 'update tasks set `error`=:error where task_id=:task_id'
275 | await db.execute_many(query=update_query, values=crawl_errors)
276 | logger.info(
277 | f'Crawl task_name={task_name} finished. Crawled: {len(done)}, Error: {len(crawl_errors)}, Timeout: {len(pending)}, Update: {update_counts}.{" +++" if update_counts else ""}'
278 | )
279 | for task in changed_tasks:
280 | ensure_future(try_catch(Config.callback_handler.callback, task))
281 | query_feeds.cache_clear()
282 | else:
283 | logger.info(f'Crawl task_name={task_name} finished. 0 todo.')
284 | if CLEAR_CACHE_NEEDED:
285 | logger.info('Clear cache for crawling new tasks.')
286 | query_tasks.cache_clear()
287 | if task_name:
288 | query = tasks.select().where(tasks.c.name == task_name)
289 | _task = await db.fetch_one(query=query)
290 | return dict(_task)
291 | else:
292 | return has_more
293 |
294 |
295 | async def crawl_once(task_name: Optional[str] = None):
296 | if task_name is not None:
297 | return await _crawl_once(task_name)
298 | with solo:
299 | result = await try_catch(_crawl_once, task_name)
300 | return result
301 |
302 |
303 | async def save_feed(new_seeds, db, task):
304 | if not new_seeds:
305 | return
306 | try:
307 | values = []
308 | for result in new_seeds:
309 | value = {
310 | 'task_id': task.task_id,
311 | 'name': task.name,
312 | 'text': result.get('title') or result.get('text') or '',
313 | 'url': result.get('url') or task.origin_url,
314 | 'ts_create': datetime.now(),
315 | }
316 | values.append(value)
317 |
318 | query = "INSERT INTO feeds (`task_id`, `name`, `text`, `url`, `ts_create`) values (:task_id, :name, :text, :url, :ts_create)"
319 | result = await db.execute_many(query=query, values=values)
320 | Config.logger.info(
321 | f'Inserting task seeds succeeded({task.name}): {len(values)} rows')
322 | return result
323 | except Exception:
324 | Config.logger.error(
325 | f'Inserting task seeds failed({task.name}): {format_exc()}')
326 |
--------------------------------------------------------------------------------
/watchdogs/models.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datetime import datetime
3 | from traceback import format_exc
4 | from typing import Iterable, List, Optional, Set, Tuple, Union
5 |
6 | import sqlalchemy
7 | from async_lru import alru_cache
8 | from databases import Database
9 | from pydantic import BaseModel
10 | from sqlalchemy.sql import text
11 | from uniparser import CrawlerRule, HostRule
12 | from uniparser.crawler import RuleStorage, get_host
13 |
14 | from .config import Config
15 |
16 | if Config.COLLATION is None:
17 | if Config.db_url.startswith('sqlite'):
18 | Config.COLLATION = None
19 | else:
20 | Config.COLLATION = 'utf8_unicode_ci'
21 |
22 | metadata = sqlalchemy.MetaData()
23 | date0 = datetime.strptime('1970-01-01 08:00:00', '%Y-%m-%d %H:%M:%S')
24 | # server_default works instead of default, issue: https://github.com/encode/databases/issues/72
25 | tasks = sqlalchemy.Table(
26 | "tasks",
27 | metadata,
28 | sqlalchemy.Column('task_id',
29 | sqlalchemy.Integer,
30 | primary_key=True,
31 | autoincrement=True),
32 | sqlalchemy.Column("name",
33 | sqlalchemy.String(64, collation=Config.COLLATION),
34 | nullable=False,
35 | index=True,
36 | unique=True),
37 | sqlalchemy.Column("enable",
38 | sqlalchemy.Integer,
39 | server_default=text('1'),
40 | nullable=False),
41 | sqlalchemy.Column("tag",
42 | sqlalchemy.String(128, collation=Config.COLLATION),
43 | server_default="default",
44 | nullable=False),
45 | sqlalchemy.Column("error", sqlalchemy.TEXT(collation=Config.COLLATION)),
46 | sqlalchemy.Column("request_args",
47 | sqlalchemy.TEXT(collation=Config.COLLATION),
48 | nullable=False),
49 | sqlalchemy.Column("origin_url",
50 | sqlalchemy.String(1024),
51 | nullable=False,
52 | server_default=""),
53 | sqlalchemy.Column("interval",
54 | sqlalchemy.Integer,
55 | server_default=text('300'),
56 | nullable=False),
57 | sqlalchemy.Column("work_hours",
58 | sqlalchemy.String(32),
59 | server_default='0, 24',
60 | nullable=False),
61 | sqlalchemy.Column("max_result_count",
62 | sqlalchemy.Integer,
63 | server_default=text('10'),
64 | nullable=False),
65 | sqlalchemy.Column("latest_result", sqlalchemy.TEXT),
66 | sqlalchemy.Column("result_list", sqlalchemy.TEXT), # JSON list
67 | sqlalchemy.Column("last_check_time",
68 | sqlalchemy.TIMESTAMP,
69 | server_default="1970-01-01 08:00:00",
70 | nullable=False),
71 | sqlalchemy.Column("next_check_time",
72 | sqlalchemy.TIMESTAMP,
73 | server_default="1970-01-01 08:00:00",
74 | nullable=False),
75 | sqlalchemy.Column("last_change_time",
76 | sqlalchemy.TIMESTAMP,
77 | server_default="1970-01-01 08:00:00",
78 | index=True,
79 | nullable=False),
80 | sqlalchemy.Column("custom_info",
81 | sqlalchemy.TEXT(collation=Config.COLLATION)),
82 | )
83 | host_rules = sqlalchemy.Table(
84 | "host_rules",
85 | metadata,
86 | sqlalchemy.Column('host', sqlalchemy.String(128), primary_key=True),
87 | sqlalchemy.Column('host_rule', sqlalchemy.TEXT),
88 | )
89 | metas = sqlalchemy.Table(
90 | "metas",
91 | metadata,
92 | sqlalchemy.Column('key',
93 | sqlalchemy.String(64, collation=Config.COLLATION),
94 | primary_key=True),
95 | sqlalchemy.Column('value', sqlalchemy.TEXT(collation=Config.COLLATION)),
96 | )
97 | feeds = sqlalchemy.Table(
98 | "feeds",
99 | metadata,
100 | sqlalchemy.Column('id',
101 | sqlalchemy.Integer,
102 | primary_key=True,
103 | autoincrement=True),
104 | sqlalchemy.Column('task_id', sqlalchemy.Integer, nullable=False),
105 | sqlalchemy.Column("name",
106 | sqlalchemy.String(64, collation=Config.COLLATION),
107 | nullable=False),
108 | # sqlalchemy.Column("tag",
109 | # sqlalchemy.String(128, collation=Config.COLLATION),
110 | # server_default="default",
111 | # nullable=False),
112 | sqlalchemy.Column("text", sqlalchemy.TEXT),
113 | sqlalchemy.Column("url",
114 | sqlalchemy.String(1024),
115 | nullable=False,
116 | server_default=""),
117 | sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False),
118 | )
119 | groups = sqlalchemy.Table(
120 | "groups",
121 | metadata,
122 | sqlalchemy.Column('id',
123 | sqlalchemy.Integer,
124 | primary_key=True,
125 | autoincrement=True),
126 | sqlalchemy.Column("name",
127 | sqlalchemy.String(64, collation=Config.COLLATION),
128 | nullable=False),
129 | sqlalchemy.Column("task_ids", sqlalchemy.TEXT),
130 | # sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False),
131 | )
132 |
133 |
134 | def create_tables(db_url):
135 | try:
136 | engine = sqlalchemy.create_engine(db_url)
137 | metadata.create_all(engine)
138 | except BaseException:
139 | Config.logger.critical(f'Fatal error on creating Table: {format_exc()}')
140 | import os
141 | os._exit(1)
142 |
143 |
144 | class RuleStorageDB(RuleStorage):
145 |
146 | def __init__(self, db):
147 | self.db = db
148 | self.logger = Config.logger
149 |
150 | async def commit(self):
151 | pass
152 |
153 | async def get_host_rule(self, host: str, default=None):
154 | query = "SELECT host_rule FROM host_rules WHERE host = :host"
155 | host_rule = await self.db.fetch_one(query=query, values={"host": host})
156 | if host_rule:
157 | return HostRule.loads(host_rule[0])
158 | else:
159 | return default
160 |
161 | async def find_crawler_rule(self, url, method='find') -> CrawlerRule:
162 | if not url:
163 | return None
164 | host = get_host(url)
165 | host_rule = await self.get_host_rule(host)
166 | if host_rule:
167 | return host_rule.find(url)
168 |
169 | async def add_crawler_rule(self, rule: CrawlerRule, commit=None):
170 | if isinstance(rule, str):
171 | rule = CrawlerRule.loads(rule)
172 | elif isinstance(rule, dict) and not isinstance(rule, CrawlerRule):
173 | rule = CrawlerRule(**rule)
174 | if not rule.get('regex'):
175 | raise ValueError('regex should not be null')
176 | url = rule.get('request_args', {}).get('url')
177 | if not url:
178 | self.logger.error(f'[Rule] {rule["name"]} has no url.')
179 | return False
180 | host = get_host(url)
181 | if not host:
182 | return False
183 | exist_host_rule = await self.get_host_rule(host)
184 | if exist_host_rule:
185 | exist_host_rule.add_crawler_rule(rule)
186 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host"
187 | return await self.db.execute(
188 | query=query,
189 | values={
190 | 'host_rule_string': exist_host_rule.dumps(),
191 | 'host': host
192 | })
193 | else:
194 | host_rule = HostRule(host)
195 | host_rule.add_crawler_rule(rule)
196 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)"
197 | return await self.db.execute(
198 | query=query,
199 | values={
200 | 'host_rule_string': host_rule.dumps(),
201 | 'host': host
202 | })
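# A CrawlerRule (dict or JSON) passed in here looks roughly like the one used by the web UI:
# {'name': ..., 'request_args': {'method': 'get', 'url': ..., 'headers': {...}},
#  'parse_rules': [...], 'regex': '^https?://...', 'encoding': ''}; the 'regex' and
# 'request_args.url' fields are required by the checks above.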
203 |
204 | async def pop_crawler_rule(self, rule: CrawlerRule, commit=False):
205 | query = "SELECT host_rule FROM host_rules"
206 | host = get_host(rule['request_args'].get('url'))
207 | values = {}
208 | if host:
209 | query += ' WHERE host = :host'
210 | values['host'] = host
211 | rows = await self.db.fetch_all(query=query, values=values)
212 | for row in rows:
213 | host_rule = HostRule.loads(row.host_rule)
214 | crawler_rule = host_rule.pop_crawler_rule(rule['name'])
215 | if crawler_rule:
216 | # update host_rule
217 | await self.add_host_rule(host_rule)
218 | return crawler_rule
219 |
220 | async def add_host_rule(self, rule: HostRule, commit=None):
221 | """insert or update HostRule"""
222 | # some SQL dialects do not support upsert (REPLACE INTO / ON CONFLICT), so select first, then insert or update
223 | query = "SELECT host_rule FROM host_rules WHERE host = :host"
224 | exist_host_rule = await self.get_host_rule(rule['host'])
225 | if exist_host_rule:
226 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host"
227 | return await self.db.execute(query=query,
228 | values={
229 | 'host_rule_string': rule.dumps(),
230 | 'host': rule['host']
231 | })
232 | else:
233 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)"
234 | return await self.db.execute(query=query,
235 | values={
236 | 'host_rule_string': rule.dumps(),
237 | 'host': rule['host']
238 | })
239 |
240 | async def pop_host_rule(self, host: str, commit=None):
241 | exist_host_rule = await self.get_host_rule(host)
242 | host_rule = exist_host_rule  # get_host_rule already returns a HostRule instance (or None)
243 | if host_rule:
244 | query = "delete FROM host_rules WHERE host = :host"
245 | await self.db.execute(query=query, values={'host': host})
246 | return host_rule
247 |
248 |
249 | class Task(BaseModel):
250 | task_id: Optional[int] = None
251 | name: str
252 | enable: int = 0
253 | tag: str = 'default'
254 | error: str = ''
255 | request_args: str
256 | origin_url: str = ''
257 | interval: int = 300
258 | work_hours: str = '0, 24'
259 | max_result_count: int = 30
260 | latest_result: str = '{}'
261 | result_list: str = '[]'
262 | last_check_time: datetime = date0
263 | next_check_time: datetime = date0
264 | last_change_time: datetime = date0
265 | custom_info: str = ''
266 |
267 |
268 | class Group(BaseModel):
269 | id: Optional[int] = None
270 | name: str = ''
271 | task_ids: str = ''
272 |
273 |
274 | class Feed(BaseModel):
275 | task_id: int
276 | name: str
277 | text: str
278 | url: str
279 | ts_create: datetime
280 |
281 |
282 | class Metas(object):
283 | """Save & Load some variables with db"""
284 |
285 | def __init__(self, db: Database):
286 | self.db = db
287 |
288 | async def set(self, key, value):
289 | query = 'replace into metas (`key`, `value`) values (:key, :value)'
290 | await Config.db.execute(query, values={'key': key, 'value': value})
291 | self.clear_cache()
292 | if (await self.get(key)) == value:
293 | return True
294 | else:
295 | return False
296 |
297 | async def remove(self, key):
298 | query = 'delete from metas where `key`=:key'
299 | await Config.db.execute(query, values={'key': key})
300 | self.clear_cache()
301 | if not (await self.get(key)):
302 | return True
303 | else:
304 | return False
305 |
306 | @alru_cache(maxsize=Config.metas_cache_maxsize)
307 | async def _get(self, key, default=None):
308 | query = 'select `value` from metas where `key`=:key'
309 | result = await self.db.fetch_one(query, values={'key': key})
310 | if result:
311 | return result.value
312 | else:
313 | return default
314 |
315 | async def get(self, key, default=None, cache=True):
316 | if not cache:
317 | self.clear_cache()
318 | return await self._get(key, default=default)
319 |
320 | def clear_cache(self):
321 | self._get.cache_clear()
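# Typical usage (mirroring settings.py): `await Config.metas.set('md5_salt', salt)` stores a
# value and returns True on success; `await Config.metas.get('md5_salt', None)` reads it back
# through the alru_cache; pass cache=False to force a fresh read.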
322 |
323 |
324 | @alru_cache(maxsize=Config.query_tasks_cache_maxsize)
325 | async def query_tasks(
326 | task_name: Optional[str] = None,
327 | task_id: Optional[int] = None,
328 | page: int = 1,
329 | page_size: int = Config.default_page_size,
330 | order_by: str = 'last_change_time',
331 | sort: str = 'desc',
332 | tag: str = '',
333 | task_ids: Tuple[int] = None,
334 | ) -> Tuple[List[dict], bool]:
335 | # task_ids is a tuple (not a list) so the arguments can be hashed for the cache
336 | offset = page_size * (page - 1)
337 | query = tasks.select()
338 | if task_ids:
339 | query = query.where(tasks.c.task_id.in_(task_ids))
340 | else:
341 | if task_id is not None:
342 | query = query.where(tasks.c.task_id == task_id)
343 | if task_name is not None:
344 | query = query.where(tasks.c.name == task_name)
345 | if tag:
346 | query = query.where(tasks.c.tag == tag)
347 | if order_by and sort:
348 | ob = getattr(tasks.c, order_by, None)
349 | if ob is None:
350 | raise ValueError(f'bad order_by {order_by}')
351 | if sort.lower() == 'desc':
352 | ob = sqlalchemy.desc(ob)
353 | elif sort.lower() == 'asc':
354 | ob = sqlalchemy.asc(ob)
355 | else:
356 | raise ValueError(
357 | f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
358 | query = query.order_by(ob)
359 | query = query.limit(page_size + 1).offset(offset)
360 | _result = await Config.db.fetch_all(query=query)
361 | has_more = len(_result) > page_size
362 | result = [dict(i) for i in _result][:page_size]
363 | query_string = str(query.compile(
364 | compile_kwargs={"literal_binds": True})).replace('\n', '')
365 | Config.logger.info(
366 | f'[Query] {len(result)} tasks (has_more={has_more}): {query_string}')
367 | return result, has_more
368 |
369 |
370 | @alru_cache(maxsize=Config.query_task_ids_cache_maxsize)
371 | async def query_task_ids(task_name: Optional[str] = None,
372 | tag: str = '') -> List[int]:
373 | query = tasks.select()
374 | if task_name is not None:
375 | query = query.where(tasks.c.name == task_name)
376 | if tag:
377 | query = query.where(tasks.c.tag == tag)
378 | _result = await Config.db.fetch_all(query=query)
379 | result = [dict(i)['task_id'] for i in _result]
380 | query_string = str(query.compile(
381 | compile_kwargs={"literal_binds": True})).replace('\n', '')
382 | Config.logger.info(f'[Query] {len(result)} task ids: {query_string}')
383 | return result
384 |
385 |
386 | @alru_cache(maxsize=Config.query_group_task_ids_cache_maxsize)
387 | async def query_group_task_ids(
388 | group_id: int = None,
389 | group_ids: Union[str, Tuple[int]] = None,
390 | ) -> List[int]:
391 | _group_ids: Set[int] = set()
392 | if group_id:
393 | _group_ids.add(int(group_id))
394 | if group_ids:
395 | if isinstance(group_ids, str):
396 | for _group_id in re.findall(r'\d+', group_ids):
397 | _group_ids.add(int(_group_id))
398 | elif isinstance(group_ids, tuple):
399 | _group_ids.update(int(_group_id) for _group_id in group_ids)
400 | task_ids: Set[int] = set()
401 | for _group_id in _group_ids:
402 | query = groups.select()
403 | query = query.where(groups.c.id == _group_id)
404 | _result = await Config.db.fetch_one(query=query)
405 | if _result:
406 | task_ids_str = dict(_result).get('task_ids') or ''
407 | for task_id in re.findall(r'\d+', task_ids_str):
408 | task_ids.add(int(task_id))
409 | query_string = str(query.compile(
410 | compile_kwargs={"literal_binds": True})).replace('\n', '')
411 | Config.logger.info(
412 | f'[Query] {len(task_ids)} task_ids by group {group_id or group_ids}: {query_string}'
413 | )
414 | return list(task_ids)
415 |
416 |
417 | @alru_cache(maxsize=Config.query_feeds_cache_maxsize)
418 | async def query_feeds(
419 | task_name: Optional[str] = None,
420 | task_id: Optional[int] = None,
421 | page: int = 1,
422 | page_size: int = Config.default_page_size,
423 | order_by: str = 'id',
424 | sort: str = 'desc',
425 | tag: str = '',
426 | task_ids: Tuple[int] = None,
427 | ) -> Tuple[List[dict], bool]:
428 | # task_ids is a tuple (not a list) so the arguments can be hashed for the cache
429 | offset = page_size * (page - 1)
430 | query = feeds.select()
431 | _task_ids: List[int] = []
432 | if task_ids:
433 | _task_ids.extend(task_ids)
434 | if tag:
435 | _task_ids += await query_task_ids(tag=tag)
436 | if _task_ids:
437 | query = query.where(feeds.c.task_id.in_(tuple(_task_ids)))
438 | else:
439 | if task_id is not None:
440 | query = query.where(feeds.c.task_id == task_id)
441 | if task_name is not None:
442 | query = query.where(feeds.c.name == task_name)
443 | if order_by and sort:
444 | ob = getattr(feeds.c, order_by, None)
445 | if ob is None:
446 | raise ValueError(f'bad order_by {order_by}')
447 | if sort.lower() == 'desc':
448 | ob = sqlalchemy.desc(ob)
449 | elif sort.lower() == 'asc':
450 | ob = sqlalchemy.asc(ob)
451 | else:
452 | raise ValueError(
453 | f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
454 | query = query.order_by(ob)
455 | query = query.limit(page_size + 1).offset(offset)
456 | _result = await Config.db.fetch_all(query=query)
457 | has_more = len(_result) > page_size
458 | result = [dict(i) for i in _result][:page_size]
459 | query_string = str(query.compile(
460 | compile_kwargs={"literal_binds": True})).replace('\n', '')
461 | Config.logger.info(
462 | f'[Query] {len(result)} feeds (has_more={has_more}): {query_string}')
463 | return result, has_more
464 |
465 |
466 | @alru_cache(maxsize=Config.query_groups_cache_maxsize)
467 | async def query_all_groups() -> List[dict]:
468 | query = groups.select()
469 | rows = await Config.db.fetch_all(query=query)
470 | result = []
471 | for row in rows:
472 | result.append(dict(row))
473 | query_string = str(query.compile(
474 | compile_kwargs={"literal_binds": True})).replace('\n', '')
475 | Config.logger.info(f'[Query] {len(result)} groups: {query_string}')
476 | return result
477 |
478 |
479 | async def query_task_errors(tag: str = '',
480 | task_ids: Iterable = None) -> List[dict]:
481 | query = tasks.select().with_only_columns(tasks.c.name, tasks.c.error)
482 | if task_ids:
483 | query = query.where(tasks.c.task_id.in_(tuple(task_ids)))
484 | query = query.where(tasks.c.error != '')
485 | query = query.where(tasks.c.enable == 1)
486 | if tag:
487 | query = query.where(tasks.c.tag == tag)
488 | query = query.order_by(sqlalchemy.desc('last_change_time'))
489 | _result: list = await Config.db.fetch_all(query=query)
490 | result = [dict(task) for task in _result]
491 | query_string = str(query.compile(
492 | compile_kwargs={"literal_binds": True})).replace('\n', '')
493 | Config.logger.info(f'[Query] {len(result)} task errors: {query_string}')
494 | return result
495 |
--------------------------------------------------------------------------------
/watchdogs/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Watchdogs v{{version}}
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | {{init_vars}}
22 |
23 |
24 |
25 |
26 |
27 |
30 |
31 |
32 |
33 |
34 |
35 | ${scope.row.name}
36 |
37 |
38 |
39 |
40 |
41 |
42 | ${scope.row.tag}
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | ${(scope.request_args_dict =
51 | JSON.parse(scope.row.request_args))&&(scope._url = new
52 | URL(scope.request_args_dict.url)) &&
53 | scope._url.host }
54 |
55 | ${JSON.stringify(scope.request_args_dict, null, 2)}
57 | Rule
59 | Host
61 |
62 | ${ scope._url.host }
63 |
64 | :
65 | ${scope._url}
67 |
68 |
69 |
70 |
71 |
72 |
73 |
75 |
76 |
78 | ${scope.row.timeago} ago
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
88 |
91 | ${get_latest_result(scope.row.latest_result, 80)}
92 |
93 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
105 |
106 |
109 |
112 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
153 |
154 |
155 |
156 |
158 |
159 |
161 | ParseRule names should include these keys to generate RSS:
162 |
163 | text
164 |
165 | url [Optional]
166 |
167 |
168 |
169 |
170 | 1. Save Crawler Rule
171 |
172 | 2. Add New
173 | Task
174 |
175 |
176 |
177 |
178 |
179 |
180 |
182 | Host
183 |
184 |
185 |
186 |
187 |
188 |
190 | ${ host.name }
191 |
192 |
193 |
194 |
195 |
197 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
207 |
208 |
209 |
211 |
212 |
213 |
215 |
216 |
217 |
218 |
220 |
221 |
222 |
223 |
224 |
225 |
226 | Rule
228 |
229 | Host
230 |
231 |
232 | | "retry": 2, "timeout": 3, "ssl": false, and other args refer to
233 | aiohttp
235 |
236 |
238 |
239 |
240 |
242 |
243 |
244 | 5 mins
245 |
246 | |
247 | 10 mins
248 | |
249 | 30 mins
250 | |
251 | 1 hrs
252 | |
253 | 3 hrs
254 | |
255 | 6 hrs
256 | |
257 | 12 hrs
258 | |
259 | 1 day
260 | |
261 | 7 days
262 | |
263 | 30 days
264 |
266 |
267 |
268 |
269 |
271 |
272 |
273 |
274 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
284 | Callbacks:
285 |
286 | ${name||'default-callback'}
287 |
288 | ${current_cb_doc}
290 |
291 |
292 |
296 |
297 |
299 | Delete
300 |
301 |
302 | Frequency: send at most [n] requests
303 | every [interval] seconds
304 |
305 |
309 | n:
310 |
311 |
312 |
313 |
315 | interval:
316 |
317 |
318 |
319 | Update Frequency
320 |
321 |
322 |
323 |
324 | Name
325 |
326 |
327 | Regex
328 |
329 |
330 | /
331 |
332 |
333 |
334 |
335 |
336 |
338 | ${rule.name}
339 |
340 |
341 |
342 | ${rule.regex}
343 |
344 |
345 |
347 |
349 |
351 |
352 |
353 |
354 |
355 |
356 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
--------------------------------------------------------------------------------
/watchdogs/app.py:
--------------------------------------------------------------------------------
1 | from base64 import b64encode
2 | from collections import deque
3 | from datetime import datetime
4 | from json import JSONDecodeError, dumps, loads
5 | from pathlib import Path
6 | from typing import Optional
7 |
8 | import aiofiles
9 | from fastapi import Cookie, FastAPI, Header
10 | from fastapi.staticfiles import StaticFiles
11 | from starlette.requests import Request
12 | from starlette.responses import (
13 | FileResponse,
14 | HTMLResponse,
15 | JSONResponse,
16 | RedirectResponse,
17 | Response,
18 | )
19 | from starlette.templating import Jinja2Templates
20 | from torequests.utils import timeago, ttime
21 | from uniparser import CrawlerRule, Uniparser
22 | from uniparser.fastapi_ui import app as sub_app
23 | from uniparser.utils import get_host
24 |
25 | from . import __version__
26 | from .config import md5_checker
27 | from .crawler import crawl_once, find_next_check_time
28 | from .models import (
29 | Group,
30 | Task,
31 | groups,
32 | query_all_groups,
33 | query_feeds,
34 | query_group_task_ids,
35 | query_task_errors,
36 | query_tasks,
37 | tasks,
38 | )
39 | from .settings import (
40 | Config,
41 | get_host_freq_list,
42 | refresh_token,
43 | release_app,
44 | set_host_freq,
45 | setup_app,
46 | )
47 | from .utils import format_size, gen_rss
48 |
49 | description = "Watchdogs to keep an eye on the world's change.\nRead more: [https://github.com/ClericPy/watchdogs](https://github.com/ClericPy/watchdogs)"
50 | app = FastAPI(title="Watchdogs", description=description, version=__version__)
51 |
52 | Config.setup_middleware(app)
53 | sub_app.openapi_prefix = '/uniparser'
54 | app.mount("/uniparser", sub_app)
55 | app.mount("/static",
56 | StaticFiles(directory=str((Path(__file__).parent /
57 | 'static').absolute())),
58 | name="static")
59 | logger = Config.logger
60 | templates = Jinja2Templates(directory=str((Path(__file__).parent /
61 | 'templates').absolute()))
62 |
63 |
64 | @app.on_event("startup")
65 | async def startup():
66 | await setup_app(app)
67 |
68 |
69 | @app.on_event("shutdown")
70 | async def shutdown():
71 | await release_app(app)
72 |
73 |
74 | @app.post('/auth')
75 | async def post_auth(request: Request,
76 | watchdog_auth: str = Cookie(''),
77 | redirect: str = '/'):
78 | # two scenarios allow setting a new password; otherwise the submitted password is checked as a login
79 | # 1. watchdog_auth is not set yet; 2. the request is already authenticated
80 | password = loads(await request.body())['password']
81 | auth_not_set = not Config.watchdog_auth
82 | already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
83 | need_new_pwd = auth_not_set or already_authed
84 | if password:
85 | if need_new_pwd:
86 | old_password = Config.password
87 | Config.password = password
88 | await refresh_token()
89 | resp = JSONResponse({'ok': True, 'redirect': redirect})
90 | resp.set_cookie('watchdog_auth',
91 | Config.watchdog_auth,
92 | max_age=Config.cookie_max_age,
93 | httponly=True)
94 | logger.warning(
95 | f'password changed {old_password}->{Config.password}.')
96 | return resp
97 | elif (await md5_checker(password, Config.watchdog_auth, freq=True)):
98 | resp = JSONResponse({'ok': True, 'redirect': redirect})
99 | resp.set_cookie('watchdog_auth',
100 | Config.watchdog_auth,
101 | max_age=Config.cookie_max_age,
102 | httponly=True)
103 | logger.info('correct password, login success.')
104 | return resp
105 | # invalid password, clear cookie
106 | resp = JSONResponse({'ok': False})
107 | # resp.set_cookie('watchdog_auth', '')
108 | resp.delete_cookie('watchdog_auth')
109 | logger.info(f'invalid password: {password}')
110 | return resp
111 |
112 |
113 | @app.get('/auth')
114 | async def auth(request: Request,
115 | watchdog_auth: str = Cookie(''),
116 | redirect: str = '/'):
117 | auth_not_set = not Config.watchdog_auth
118 | already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
119 | need_new_pwd = auth_not_set or already_authed
120 | context: dict = {'request': request}
121 | context['version'] = __version__
122 | if need_new_pwd:
123 | context['action'] = 'Init'
124 | context['prompt_title'] = 'Set a new password'
125 | else:
126 | context['action'] = 'Login'
127 | context['prompt_title'] = 'Input the password'
128 | return templates.TemplateResponse("auth.html", context=context)
129 |
130 |
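# A minimal sketch (not from the original source) of how a client could log in
# against the POST /auth route above. The address and password are
# hypothetical, only the standard library is used, and a real client must also
# keep the `watchdog_auth` cookie returned via Set-Cookie.
def _example_login(base_url: str = 'http://127.0.0.1:9901',
                   password: str = 'secret') -> dict:
    from urllib.request import Request as UrlRequest, urlopen
    req = UrlRequest(f'{base_url}/auth',
                     data=dumps({'password': password}).encode('utf-8'),
                     method='POST')
    with urlopen(req) as resp:
        # the handler replies with {'ok': ..., 'redirect': ...} as JSON
        return loads(resp.read())
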
131 | @app.get("/")
132 | async def index(request: Request, tag: str = ''):
133 | kwargs: dict = {'request': request}
134 | kwargs['cdn_urls'] = Config.cdn_urls
135 | kwargs['version'] = __version__
136 | kwargs['rss_url'] = Config.get_route('/rss', tag=tag)
137 | kwargs['lite_url'] = Config.get_route('/lite', tag=tag)
138 | kwargs['feeds_url'] = Config.get_route('/feeds', tag=tag)
139 | kwargs['rss_feeds_url'] = Config.get_route('/rss_feeds', tag=tag)
140 | init_vars_json = dumps({
141 | 'custom_links': Config.custom_links,
142 | 'callback_workers': Config.callback_handler.workers,
143 | 'custom_tabs': Config.custom_tabs,
144 | 'work_hours_doc': find_next_check_time.__doc__,
145 | })
146 | init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
147 | kwargs['init_vars'] = init_vars_b64
148 | return templates.TemplateResponse("index.html", context=kwargs)
149 |
150 |
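# Illustrative only, not original source: the index page receives its initial
# state as base64-encoded JSON via `init_vars` above; this helper simply
# reverses that encoding, e.g. for tests. The function name is made up.
def _example_decode_init_vars(init_vars_b64: str) -> dict:
    from base64 import b64decode  # the app itself only needs b64encode
    return loads(b64decode(init_vars_b64.encode('u8')).decode('u8'))
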
151 | @app.get("/favicon.ico")
152 | async def favicon():
153 | return RedirectResponse('/static/img/favicon.svg', 301)
154 |
155 |
156 | @app.post("/add_new_task")
157 | async def add_new_task(task: Task):
158 | try:
159 | if task.interval < 60:
160 |             raise ValueError('interval should not be less than 60 seconds.')
161 | db = Config.db
162 |         # decide between insert and update by whether task_id already exists
163 | if task.task_id is None:
164 | # insert new task
165 | query = tasks.insert()
166 | values = dict(task)
167 | if not values.get('error'):
168 | values['error'] = ''
169 |             # insert with task_id left as None so the db assigns it
170 | await db.execute(query=query, values=values)
171 | else:
172 |             # update the existing task
173 | query = 'update tasks set `name`=:name,`enable`=:enable,`tag`=:tag,`request_args`=:request_args,`origin_url`=:origin_url,`interval`=:interval,`work_hours`=:work_hours,`max_result_count`=:max_result_count,`custom_info`=:custom_info,`next_check_time`=:next_check_time where `task_id`=:task_id'
174 | values = {
175 | 'task_id': task.task_id,
176 | 'name': task.name,
177 | 'enable': task.enable,
178 | 'tag': task.tag,
179 | 'request_args': task.request_args,
180 | 'origin_url': task.origin_url,
181 | 'interval': task.interval,
182 | 'work_hours': task.work_hours,
183 | 'max_result_count': task.max_result_count,
184 | 'custom_info': task.custom_info,
185 | 'next_check_time': datetime.now(),
186 | }
187 | await db.execute(query=query, values=values)
188 | result = {'msg': 'ok'}
189 | query_tasks.cache_clear()
190 | except Exception as e:
191 | result = {'msg': repr(e)}
192 | logger.info(
193 | f'{"[Add]" if task.task_id is None else "[Update]"} task {task}: {result}'
194 | )
195 | return result
196 |
197 |
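# A hypothetical JSON body for POST /add_new_task, sketched for illustration
# and not taken from the project. The keys mirror the task columns used in the
# update query above; every value is invented, and the work_hours format is
# the one documented by find_next_check_time.__doc__.
_EXAMPLE_NEW_TASK = {
    'task_id': None,        # None means "insert a new task" in the handler above
    'name': 'python-home-watch',
    'enable': 1,
    'tag': 'software',
    'request_args': '{"method": "get", "url": "https://www.python.org/"}',
    'origin_url': 'https://www.python.org/',
    'interval': 300,        # seconds; the handler rejects anything below 60
    'work_hours': '0, 24',
    'max_result_count': 10,
    'custom_info': '',
}
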
198 | @app.get("/delete_task")
199 | async def delete_task(task_id: int):
200 | try:
201 | query = tasks.delete().where(tasks.c.task_id == task_id)
202 | await Config.db.execute(query=query)
203 | result = {'msg': 'ok'}
204 | query_tasks.cache_clear()
205 | except Exception as e:
206 | result = {'msg': repr(e)}
207 | logger.info(f'[Delete] task {task_id}: {result}')
208 | return result
209 |
210 |
211 | @app.get("/force_crawl")
212 | async def force_crawl(task_name: str):
213 | try:
214 | task = await crawl_once(task_name=task_name)
215 | task['timeago'] = timeago(
216 | (datetime.now() - task['last_change_time']).total_seconds(),
217 | 1,
218 | 1,
219 | short_name=True)
220 | result = {'msg': 'ok', 'task': task}
221 | except Exception as e:
222 | result = {'msg': repr(e)}
223 | logger.info(f'[Force] crawl {task_name}: {result}')
224 | return result
225 |
226 |
227 | @app.get("/load_tasks")
228 | async def load_tasks(
229 | task_name: Optional[str] = None,
230 | page: int = 1,
231 | page_size: int = Config.default_page_size,
232 | order_by: str = 'last_change_time',
233 | sort: str = 'desc',
234 | tag: str = '',
235 | ):
236 | try:
237 | _result, has_more = await query_tasks(
238 | task_name=task_name,
239 | page=page,
240 | page_size=page_size,
241 | order_by=order_by,
242 | sort=sort,
243 | tag=tag,
244 | )
245 |         _result = list(_result)
246 | now = datetime.now()
247 | for item in _result:
248 | item['timeago'] = timeago(
249 | (now - item['last_change_time']).total_seconds(),
250 | 1,
251 | 1,
252 | short_name=True)
253 | result = {'msg': 'ok', 'tasks': _result, 'has_more': has_more}
254 | except Exception as e:
255 | result = {'msg': str(e), 'tasks': [], 'has_more': False}
256 | return result
257 |
258 |
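# Illustrative note, not original source: timeago() comes from torequests.utils
# and the handlers above all call it with the same arguments to render the
# "time since last change" of a task row; the helper below just names that
# pattern.
def _example_render_timeago(last_change_time: datetime) -> str:
    return timeago((datetime.now() - last_change_time).total_seconds(),
                   1,
                   1,
                   short_name=True)
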
259 | @app.get("/enable_task")
260 | async def enable_task(task_id: int, enable: int = 1):
261 | query = 'update tasks set `enable`=:enable where `task_id`=:task_id'
262 | values = {'task_id': task_id, 'enable': enable}
263 | try:
264 | _result = await Config.db.execute(query, values)
265 | result = {'msg': 'ok', 'updated': _result}
266 | query_tasks.cache_clear()
267 | except Exception as e:
268 | result = {'msg': repr(e)}
269 | return result
270 |
271 |
272 | @app.get('/load_hosts')
273 | async def load_hosts(host: str = ''):
274 | host = get_host(host) or host
275 | query = 'select `host` from host_rules'
276 | if host:
277 | query += ' where `host` like :host'
278 | values = {'host': f'%{host}%'}
279 | else:
280 | values = {}
281 | query += ' order by `host` asc'
282 | _result = await Config.db.fetch_all(query, values)
283 | host_freqs = Uniparser._HOST_FREQUENCIES
284 | hosts = [{
285 | 'name': getattr(i, 'host', None),
286 | 'freq': getattr(i, 'host', None) in host_freqs
287 | } for i in _result]
288 | return {'hosts': hosts, 'host': host}
289 |
290 |
291 | @app.get("/get_host_rule")
292 | async def get_host_rule(host: str):
293 | try:
294 | if not host:
295 | raise ValueError('host name should not be null')
296 | query = 'select `host_rule` from host_rules where `host`=:host'
297 | values = {'host': host}
298 | _result = await Config.db.fetch_one(query, values)
299 | host_rule = getattr(_result, 'host_rule', None)
300 | host_rule = loads(host_rule) if host_rule else {"host": host}
301 | host_rule['n'], host_rule['interval'] = get_host_freq_list(host)
302 | result = {'msg': 'ok', 'host_rule': host_rule}
303 | except Exception as e:
304 | result = {'msg': repr(e)}
305 | logger.info(f'[Get] host_rule {host}: {result}')
306 | return result
307 |
308 |
309 | @app.post("/crawler_rule.{method}")
310 | async def crawler_rule(method: str,
311 | rule: CrawlerRule,
312 | force: Optional[int] = 0):
313 | try:
314 | if not rule['name']:
315 |             raise ValueError('rule name cannot be null')
316 | if method == 'add':
317 | if force:
318 | exist_rule = await Config.rule_db.find_crawler_rule(
319 | rule['request_args']['url'])
320 | if exist_rule:
321 | logger.info(
322 | f'add crawler_rule force=1, old rule removed: {exist_rule}'
323 | )
324 | await Config.rule_db.pop_crawler_rule(exist_rule)
325 | _result = await Config.rule_db.add_crawler_rule(rule)
326 | elif method == 'pop':
327 | _result = await Config.rule_db.pop_crawler_rule(rule)
328 | else:
329 |             raise ValueError('method only supports add and pop')
330 | result = {'msg': 'ok', 'result': _result}
331 | except Exception as e:
332 | result = {'msg': repr(e)}
333 | logger.info(f'[{method.title()}] crawler rule {rule}: {result}')
334 | return result
335 |
336 |
337 | @app.post("/find_crawler_rule")
338 | async def find_crawler_rule(request_args: dict):
339 | try:
340 | url = request_args.get('url')
341 | rule: CrawlerRule = await Config.rule_db.find_crawler_rule(url)
342 | if not rule:
343 |             raise ValueError(f'rule not found for the given url: {url}')
344 | result = {'msg': 'ok', 'result': rule.dumps()}
345 | except Exception as e:
346 | result = {'msg': repr(e)}
347 | logger.info(f'[Find] crawler rule: {result}')
348 | return result
349 |
350 |
351 | @app.get("/delete_host_rule")
352 | async def delete_host_rule(host: str):
353 | try:
354 | if not host:
355 | raise ValueError('host should not be null')
356 | await Config.rule_db.pop_host_rule(host)
357 | result = {'msg': 'ok'}
358 | except Exception as e:
359 | result = {'msg': repr(e)}
360 | logger.info(f'[Delete] host rule {host}: {result}')
361 | return result
362 |
363 |
364 | @app.get("/log")
365 | async def log(request: Request,
366 | max_lines: int = 50,
367 | refresh_every: int = 0,
368 | log_names: str = 'info-server-error'):
369 | window: deque = deque((), max_lines)
370 | names: list = log_names.split('-')
371 | items = []
372 | for name in names:
373 | file_name = f'{name}.log'
374 | fp: Path = Config.CONFIG_DIR / file_name
375 | if not fp.is_file():
376 | continue
377 | fp_stat = fp.stat()
378 | file_size = format_size(fp_stat.st_size)
379 | st_mtime = ttime(fp_stat.st_mtime)
380 | line_no = 0
381 | async with aiofiles.open(fp, encoding=Config.ENCODING) as f:
382 | async for line in f:
383 | line_no += 1
384 | window.append(line)
385 | item = {
386 | 'name': name,
387 | 'line_no': line_no,
388 | 'file_size': file_size,
389 | 'st_mtime': st_mtime,
390 | 'log_text': "".join(window),
391 | 'file_size_mb': Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
392 | 'file_size_mb', '-1'),
393 | }
394 | items.append(item)
395 | window.clear()
396 | context = {
397 | 'request': request,
398 | 'items': items,
399 | 'log_names': log_names,
400 | 'refresh_every': refresh_every,
401 | 'max_lines': max_lines,
402 | }
403 | return templates.TemplateResponse("logs.html", context=context)
404 |
405 |
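# Sketch for illustration, not project code: the /log view above keeps only
# the newest `max_lines` lines by pushing every line through a bounded deque;
# the same trick in isolation looks like this.
def _example_tail(lines, max_lines: int = 50) -> str:
    window = deque((), max_lines)  # once full, appending drops the oldest entry
    for line in lines:
        window.append(line)
    return ''.join(window)
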
406 | @app.get("/log.clear")
407 | async def log_clear(log_names: str = 'info-server-error',
408 | current_names: str = 'info-server-error'):
409 | names: list = log_names.split('-')
410 | for name in names:
411 | fp: Path = Config.CONFIG_DIR / f'{name}.log'
412 | if not fp.is_file():
413 | continue
414 |         # sync write on purpose: truncating a small log file is quick, even though it briefly blocks the event loop
415 | fp.write_bytes(b'')
416 | logger.info(f'{name}.log cleared')
417 | html = f' {log_names} log cleared. Redirecting back.'
418 | return HTMLResponse(html)
419 |
420 |
421 | @app.get("/update_host_freq")
422 | async def update_host_freq(host: str,
423 | n: Optional[int] = 0,
424 | interval: Optional[int] = 0):
425 | try:
426 | if not host:
427 | raise ValueError('host should not be null')
428 | await set_host_freq(host, n=n, interval=interval)
429 | result = {'msg': 'ok'}
430 | except Exception as e:
431 | result = {'msg': repr(e)}
432 | logger.info(f'[Update] host frequency {host}: {result}')
433 | return result
434 |
435 |
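# Sketch for illustration, not project code: host frequencies pair a request
# budget `n` with a window `interval` in seconds (see get_host_freq_list and
# set_host_freq above); the host and the numbers below are made up.
async def _example_limit_host(host: str = 'www.python.org'):
    await set_host_freq(host, n=1, interval=60)  # roughly: 1 request per 60 s
    return get_host_freq_list(host)
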
436 | @app.get("/rss")
437 | async def rss(
438 | request: Request,
439 | tag: str = '',
440 | sign: str = '',
441 | host: str = Header('', alias='Host'),
442 | group_ids: str = '',
443 | ):
444 | if group_ids:
445 | task_ids = tuple(await query_group_task_ids(group_ids))
446 | if not task_ids:
447 | return JSONResponse(
448 | status_code=404,
449 | content={
450 | "message": 'query no tasks',
451 | },
452 | )
453 | tasks, _ = await query_tasks(task_ids=task_ids)
454 | else:
455 | tasks, _ = await query_tasks(tag=tag)
456 | source_link = f'https://{host}'
457 | xml_data: dict = {
458 | 'channel': {
459 | 'title': 'Watchdogs',
460 | 'description': f'Watchdog on web change, v{__version__}.',
461 | 'link': source_link,
462 | },
463 | 'items': []
464 | }
465 | for task in tasks:
466 | pubDate: str = task['last_change_time'].strftime(
467 | format='%a, %d %b %Y %H:%M:%S')
468 | latest_result: dict = loads(task['latest_result'] or '{}')
469 | if isinstance(latest_result, list):
470 |             logger.error(f'latest_result is list: {latest_result}'); latest_result = {}  # guard: .get() below needs a dict
471 | link: str = latest_result.get('url') or task['origin_url']
472 | description: str = latest_result.get('text') or ''
473 | title: str = f'{task["name"]}#{latest_result.get("title", description[:Config.TEXT_SLICE_LENGTH])}'
474 | item: dict = {
475 | 'title': title,
476 | 'link': link,
477 | 'guid': title,
478 | 'description': description,
479 | 'pubDate': pubDate
480 | }
481 | xml_data['items'].append(item)
482 | xml: str = gen_rss(xml_data)
483 | response = Response(
484 | content=xml,
485 | media_type="application/xml",
486 | headers={'Content-Type': 'application/xml; charset="utf-8"'})
487 | return response
488 |
489 |
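# Sketch for illustration, not project code: the dict handed to gen_rss()
# above has this shape, with invented values; 'channel' becomes the RSS
# <channel> metadata and each entry of 'items' becomes one <item>.
_EXAMPLE_RSS_DATA = {
    'channel': {
        'title': 'Watchdogs',
        'description': 'Watchdog on web change.',
        'link': 'https://example.com',
    },
    'items': [{
        'title': 'python-home-watch#Python x.y.z released',
        'link': 'https://www.python.org/',
        'guid': 'python-home-watch#Python x.y.z released',
        'description': 'Python x.y.z released',
        'pubDate': 'Mon, 01 Jan 2024 00:00:00',
    }],
}
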
490 | @app.post("/lite")
491 | async def post_lite(request: Request, tag: str = '', sign: str = ''):
492 | task_id = loads(await request.body())['task_id']
493 | tasks, _ = await query_tasks(task_id=task_id)
494 | if tasks:
495 | task = tasks[0]
496 | try:
497 | result_list = loads(
498 | task['result_list']) if task['result_list'] else []
499 | except JSONDecodeError:
500 | result_list = []
501 | return {'result_list': result_list}
502 | else:
503 | return {'result_list': []}
504 |
505 |
506 | @app.get("/lite")
507 | async def lite(
508 | request: Request,
509 | tag: str = '',
510 | sign: str = '',
511 | page: int = 1,
512 | group_ids: str = '',
513 | ):
514 | if group_ids:
515 | task_ids = tuple(await query_group_task_ids(group_ids))
516 | if not task_ids:
517 | return JSONResponse(
518 | status_code=404,
519 | content={
520 | "message": 'query no tasks',
521 | },
522 | )
523 | tasks, has_more = await query_tasks(task_ids=task_ids, page=page)
524 | else:
525 | tasks, has_more = await query_tasks(tag=tag, page=page)
526 | now = datetime.now()
527 | for task in tasks:
528 | result = loads(task['latest_result'] or '{}')
529 | # set / get cache from task
530 | task['url'] = task.get('url') or result.get('url') or task['origin_url']
531 | task['text'] = task.get('text') or result.get('title') or result.get(
532 | 'text') or ''
533 | task['timeago'] = timeago(
534 | (now - task['last_change_time']).total_seconds(),
535 | 1,
536 | 1,
537 | short_name=True)
538 | context = {'tasks': tasks, 'request': request}
539 | context['version'] = __version__
540 | if group_ids:
541 | params = {'group_ids': group_ids}
542 | else:
543 | params = {'tag': tag}
544 | context['home_url'] = Config.get_route('/lite', **params)
545 | if has_more:
546 | if group_ids:
547 | next_page_url = Config.get_route('/lite', page=page + 1, **params)
548 | else:
549 | next_page_url = Config.get_route('/lite', page=page + 1, **params)
550 | else:
551 | next_page_url = ''
552 | context['next_page_url'] = next_page_url
553 | if page > 1:
554 | if group_ids:
555 | last_page_url = Config.get_route('/lite', page=page - 1, **params)
556 | else:
557 | last_page_url = Config.get_route('/lite', page=page - 1, **params)
558 | else:
559 | last_page_url = ''
560 | context['last_page_url'] = last_page_url
561 | context['rss_url'] = Config.get_route('/rss', **params)
562 | return templates.TemplateResponse("lite.html", context=context)
563 |
564 |
565 | @app.get("/feeds")
566 | async def feeds(
567 | request: Request,
568 | tag: str = '',
569 | # user: str = '',
570 | sign: str = '',
571 | page: int = 1,
572 | # page_size: int = Config.default_page_size,
573 | group_ids: str = '',
574 | ):
575 | error_tasks = []
576 | if group_ids:
577 | task_ids = tuple(await query_group_task_ids(group_ids))
578 | if not task_ids:
579 | return JSONResponse(
580 | status_code=404,
581 | content={
582 | "message": 'query no tasks',
583 | },
584 | )
585 | feeds, has_more = await query_feeds(task_ids=task_ids,
586 | tag=tag,
587 | page=page)
588 | if page == 1:
589 | error_tasks.extend(await query_task_errors(tag=tag,
590 | task_ids=task_ids))
591 | else:
592 | feeds, has_more = await query_feeds(tag=tag, page=page)
593 | if page == 1:
594 | error_tasks.extend(await query_task_errors(tag=tag))
595 | now = datetime.now()
596 | _feeds = []
597 | current_date = None
598 | today = datetime.today().strftime('%Y-%m-%d')
599 | for feed in feeds:
600 | date = feed['ts_create'].strftime('%Y-%m-%d')
601 | if date != current_date:
602 | current_date = date
603 | if date == today:
604 | date += ' [Today]'
605 | _feeds.append({'current_date': date})
606 | feed['timeago'] = timeago((now - feed['ts_create']).total_seconds(),
607 | 1,
608 | 1,
609 | short_name=True)
610 | _feeds.append(feed)
611 | context = {'feeds': _feeds, 'request': request, 'error_tasks': error_tasks}
612 | context['version'] = __version__
613 | if group_ids:
614 | params = {'group_ids': group_ids}
615 | else:
616 | params = {'tag': tag}
617 | context['home_url'] = Config.get_route('/feeds', **params)
618 | if has_more:
619 | if group_ids:
620 | next_page_url = Config.get_route('/feeds', page=page + 1, **params)
621 | else:
622 | next_page_url = Config.get_route('/feeds', page=page + 1, **params)
623 | else:
624 | next_page_url = ''
625 | context['next_page_url'] = next_page_url
626 | if page > 1:
627 | if group_ids:
628 | last_page_url = Config.get_route('/feeds', page=page - 1, **params)
629 | else:
630 | last_page_url = Config.get_route('/feeds', page=page - 1, **params)
631 | else:
632 | last_page_url = ''
633 | context['last_page_url'] = last_page_url
634 | context['rss_url'] = Config.get_route('/rss_feeds', **params)
635 | return templates.TemplateResponse("feeds.html", context=context)
636 |
637 |
638 | @app.get("/rss_feeds")
639 | async def rss_feeds(request: Request,
640 | tag: str = '',
641 | sign: str = '',
642 | host: str = Header('', alias='Host'),
643 | group_ids: str = ''):
644 | if group_ids:
645 | task_ids = tuple(await query_group_task_ids(group_ids))
646 | if not task_ids:
647 | return JSONResponse(
648 | status_code=404,
649 | content={
650 | "message": 'query no tasks',
651 | },
652 | )
653 | feeds, _ = await query_feeds(task_ids=task_ids)
654 | else:
655 | feeds, _ = await query_feeds(tag=tag)
656 | source_link = f'https://{host}'
657 | xml_data: dict = {
658 | 'channel': {
659 | 'title': 'Watchdogs Timeline',
660 | 'description': f'Watchdog on web change, v{__version__}.',
661 | 'link': source_link,
662 | },
663 | 'items': []
664 | }
665 | for feed in feeds:
666 | pubDate: str = feed['ts_create'].strftime(
667 | format='%a, %d %b %Y %H:%M:%S')
668 | link: str = feed['url']
669 | description: str = feed['text']
670 | title: str = f'{feed["name"]}#{description[:Config.TEXT_SLICE_LENGTH]}'
671 | item: dict = {
672 | 'title': title,
673 | 'link': link,
674 | 'guid': str(feed['id']),
675 | 'description': description,
676 | 'pubDate': pubDate
677 | }
678 | xml_data['items'].append(item)
679 | xml: str = gen_rss(xml_data)
680 | response = Response(
681 | content=xml,
682 | media_type="application/xml",
683 | headers={'Content-Type': 'application/xml; charset="utf-8"'})
684 | return response
685 |
686 |
687 | @app.get("/groups")
688 | async def groups_route(request: Request):
689 | groups = await query_all_groups()
690 | for _group in groups:
691 | _group['href_feeds'] = Config.get_route('/feeds',
692 | group_ids=_group['id'])
693 | _group['href_lite'] = Config.get_route('/lite', group_ids=_group['id'])
694 | context = {
695 | 'request': request,
696 | 'groups': groups,
697 | }
698 | return templates.TemplateResponse("groups.html", context=context)
699 |
700 |
701 | @app.post("/update_group")
702 | async def update_group(group: Group, action: str):
703 | try:
704 | db = Config.db
705 |         # branch on the requested action: new / delete / update
706 | if action == 'new':
707 |             # insert a new group
708 | query = groups.insert()
709 | values = dict(group)
710 |             # insert with id left as None so the db assigns it
711 | resp = await db.execute(query=query, values=values)
712 | elif action == 'delete':
713 | query = 'delete from groups where `id`=:id'
714 | values = {'id': group.id}
715 | resp = await db.execute(query=query, values=values)
716 | else:
717 |             # update an existing group
718 | query = 'update groups set `name`=:name,`task_ids`=:task_ids where `id`=:id'
719 | values = {
720 | 'id': group.id,
721 | 'name': group.name,
722 | 'task_ids': group.task_ids,
723 | }
724 | resp = await db.execute(query=query, values=values)
725 | result = {'msg': 'ok', 'resp': str(resp)}
726 | except Exception as e:
727 | result = {'msg': repr(e)}
728 | finally:
729 | query_all_groups.cache_clear()
730 | query_group_task_ids.cache_clear()
731 | logger.info(f'[{action.title()}] {group}: {result}')
732 | return result
733 |
734 |
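# Sketch for illustration, not project code: a hypothetical body for
# POST /update_group?action=update. The keys mirror the columns in the update
# query above; task_ids is stored as a single string, assumed here to be
# comma-separated ids.
_EXAMPLE_GROUP = {
    'id': 1,
    'name': 'daily-reads',
    'task_ids': '1,2,3',
}
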
735 | @app.get("/sqlite")
736 | async def download_db():
737 | if Config.db_url.startswith('sqlite:///'):
738 | return FileResponse(path=Config.db_url.replace('sqlite:///', ''))
739 | return Response(content=b'not sqlite', status_code=404)
740 |
--------------------------------------------------------------------------------