├── images
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── d1.png
│   ├── d2.png
│   ├── d3.png
│   ├── d4.png
│   └── d5.png
├── watchdogs
│   ├── __main__.py
│   ├── static
│   │   ├── css
│   │   │   ├── fonts
│   │   │   │   └── element-icons.woff
│   │   │   ├── watchdogs.min.css
│   │   │   └── watchdogs.css
│   │   ├── img
│   │   │   └── favicon.svg
│   │   └── js
│   │       ├── clipboard.min.js
│   │       ├── watchdogs.min.js
│   │       └── vue-resource.min.js
│   ├── __init__.py
│   ├── background.py
│   ├── templates
│   │   ├── auth.html
│   │   ├── logs.html
│   │   ├── groups.html
│   │   ├── feeds.html
│   │   ├── lite.html
│   │   └── index.html
│   ├── main.py
│   ├── callbacks.py
│   ├── config.py
│   ├── utils.py
│   ├── settings.py
│   ├── crawler.py
│   ├── models.py
│   └── app.py
├── run_server.py
├── requirements.txt
├── LICENSE
├── .gitignore
├── setup.py
├── quick_start.md
└── README.md

/images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/1.png
--------------------------------------------------------------------------------
/images/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/2.png
--------------------------------------------------------------------------------
/images/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/3.png
--------------------------------------------------------------------------------
/images/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/4.png
--------------------------------------------------------------------------------
/images/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/5.png
--------------------------------------------------------------------------------
/images/d1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d1.png
--------------------------------------------------------------------------------
/images/d2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d2.png
--------------------------------------------------------------------------------
/images/d3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d3.png
--------------------------------------------------------------------------------
/images/d4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d4.png
--------------------------------------------------------------------------------
/images/d5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d5.png
--------------------------------------------------------------------------------
/watchdogs/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 | 
3 | if __name__ == "__main__":
4 |     main()
5 | 
--------------------------------------------------------------------------------
/watchdogs/static/css/fonts/element-icons.woff:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/watchdogs/static/css/fonts/element-icons.woff -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | from watchdogs.__main__ import main 2 | 3 | if __name__ == "__main__": 4 | # 1. pip install watchdogs 5 | # 2. python -m watchdogs 6 | main() 7 | -------------------------------------------------------------------------------- /watchdogs/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .config import Config 4 | from .main import init_app 5 | 6 | __version__ = '2.0.1' 7 | __all__ = ['Config', 'init_app'] 8 | logging.getLogger('watchdogs').addHandler(logging.NullHandler()) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles 2 | async_lru 3 | beautifulsoup4 4 | SQLAlchemy==1.4.41 5 | databases>=0.5.5 6 | pydantic<2.0.0 7 | fastapi 8 | fire 9 | jinja2 10 | jmespath 11 | jsonpath-rw-ext 12 | lxml 13 | objectpath 14 | pyyaml>=5.3 15 | selectolax 16 | toml 17 | torequests>=5.0.4 18 | uniparser>=3.0.2 19 | uvicorn 20 | -------------------------------------------------------------------------------- /watchdogs/static/css/watchdogs.min.css: -------------------------------------------------------------------------------- 1 | .full-screen,body,.el-tabs__content,.el-tabs__content > *{width:100%;height:100%;}html{margin:0 auto;zoom:90%;width:90%;height:90%;}.el-tabs__item{font-weight:bold;}.el-message-box--center{min-width:50%;}.el-message-box{width:auto;}.time-td{min-width:16em;padding-left:3em;}#input_host_form > .el-form-item:first-child .el-form-item__content,#input_host_form > .el-form-item:first-child{width:100%;}[aria-label='Edit Crawler JSON'] .el-textarea__inner{height:10em;}.el-table_1_column_8 > .cell{white-space:nowrap;}div.foot{display:flex;justify-content:center;}.host-tag{margin:0.5em;cursor:pointer;}.el-table .warning-row{background:oldlace;}.cb_name{cursor:pointer;padding-left:1em;}p.custom_links{text-align:center;color:black;background-color:rgba(223,223,223,0.5);padding:0.5em 0 0.5em 0;box-shadow:3px 3px 5px #888888;}.request_args_pre{font-size:0.9em;}[v-cloak]{display:none;}.el-popover{max-width:50%;}.el-message-box.work_hours_doc{width:40%;}pre{word-wrap:break-word;white-space:pre-wrap;} 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ClericPy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /watchdogs/background.py: -------------------------------------------------------------------------------- 1 | from asyncio import ensure_future, sleep 2 | from .utils import check_work_time, solo, try_catch 3 | from .config import Config 4 | 5 | 6 | async def crawl_chunks(crawl_once): 7 | loop_num = 0 8 | while not Config.is_shutdown: 9 | has_more = await crawl_once() 10 | if isinstance(has_more, Exception): 11 | Config.logger.error(f'crawl_once error, {has_more!r}') 12 | break 13 | Config.logger.info( 14 | f'crawl_once finished, has_more: {has_more}, loop: {loop_num}') 15 | if not has_more: 16 | break 17 | loop_num += 1 18 | 19 | 20 | async def background_loop(coro_funcs: list = None): 21 | while not Config.is_shutdown: 22 | # non-block running, and be constrained by SoloLock class 23 | for func in coro_funcs: 24 | if func.__name__ == 'crawl_once': 25 | ensure_future(try_catch(crawl_chunks, func)) 26 | else: 27 | ensure_future(try_catch(func)) 28 | await sleep(Config.check_interval) 29 | 30 | 31 | async def db_backup_handler(): 32 | logger = Config.logger 33 | if check_work_time(Config.db_backup_time): 34 | logger.warning(f'Backup DB start: {Config.db_backup_time}.') 35 | # may raise solo error 36 | with solo: 37 | result = await try_catch(Config.db_backup_function) 38 | logger.info(f'Backup DB finished: {result!r}') 39 | -------------------------------------------------------------------------------- /watchdogs/static/css/watchdogs.css: -------------------------------------------------------------------------------- 1 | .full-screen, 2 | body, 3 | .el-tabs__content, 4 | .el-tabs__content > * { 5 | width: 100%; 6 | height: 100%; 7 | } 8 | 9 | html { 10 | margin: 0 auto; 11 | zoom: 90%; 12 | width: 90%; 13 | height: 90%; 14 | } 15 | 16 | .el-tabs__item { 17 | font-weight: bold; 18 | } 19 | 20 | .el-message-box--center { 21 | min-width: 50%; 22 | } 23 | 24 | .el-message-box { 25 | width: auto; 26 | } 27 | 28 | .time-td { 29 | min-width: 16em; 30 | padding-left: 3em; 31 | } 32 | 33 | #input_host_form > .el-form-item:first-child .el-form-item__content, 34 | #input_host_form > .el-form-item:first-child { 35 | width: 100%; 36 | } 37 | 38 | [aria-label='Edit Crawler JSON'] .el-textarea__inner { 39 | height: 10em; 40 | } 41 | 42 | .el-table_1_column_8 > .cell { 43 | white-space: nowrap; 44 | } 45 | 46 | div.foot { 47 | display: flex; 48 | justify-content: center; 49 | } 50 | 51 | .host-tag { 52 | margin: 0.5em; 53 | cursor: pointer; 54 | } 55 | 56 | .el-table .warning-row { 57 | background: oldlace; 58 | } 59 | 60 | .cb_name { 61 | cursor: pointer; 62 | padding-left: 1em; 63 | } 64 | 65 | p.custom_links { 66 | text-align: center; 67 | color: black; 68 | background-color: rgba(223, 223, 223, 0.5); 69 | padding: 0.5em 0 0.5em 0; 70 | box-shadow: 3px 3px 5px #888888; 71 | } 72 | 73 | .request_args_pre { 74 | font-size: 0.9em; 75 | } 76 | 77 | [v-cloak] { 78 | display: none; 79 | } 80 | .el-popover { 81 | max-width: 50%; 82 
| } 83 | .el-message-box.work_hours_doc{ 84 | width: 40%; 85 | } 86 | pre { 87 | word-wrap: break-word; 88 | white-space: pre-wrap; 89 | } 90 | -------------------------------------------------------------------------------- /watchdogs/templates/auth.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {{action}} Watchdogs v{{version}} 10 | 11 | 19 | 20 | 21 | 22 | 25 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | tmp.py 131 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | import sys 5 | 6 | from setuptools import find_packages, setup 7 | """ 8 | linux: 9 | rm -rf "dist/*";rm -rf "build/*";python3 setup.py bdist_wheel;twine upload "dist/*;rm -rf "dist/*";rm -rf "build/*"" 10 | win32: 11 | rm -r -Force dist;rm -r -Force build;python3 setup.py bdist_wheel;twine upload "dist/*";rm -r -Force dist;rm -r -Force build;rm -r -Force watchdogs.egg-info 12 | """ 13 | 14 | py_version = sys.version_info 15 | if py_version.major < 3 or py_version.minor < 6: 16 | raise RuntimeError('Only support python3.6+') 17 | 18 | with open('requirements.txt') as f: 19 | install_requires = [line for line in f.read().strip().split('\n')] 20 | 21 | with open("README.md", encoding="u8") as f: 22 | long_description = f.read() 23 | 24 | if not re.search(r'postgresql|mysql|sqlite', str(sys.argv)): 25 | install_requires.append('aiosqlite') 26 | 27 | here = os.path.abspath(os.path.dirname(__file__)) 28 | with open(os.path.join(here, 'watchdogs', '__init__.py'), encoding="u8") as f: 29 | matched = re.search(r'''__version__ = ['"](.*?)['"]''', f.read()) 30 | if not matched: 31 | raise ValueError('Not find the __version__ info.') 32 | version = matched.group(1) 33 | 34 | description = "Watchdogs to keep an eye on the world's change. Read more: https://github.com/ClericPy/watchdogs." 
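# The extras_require mapping below exposes the optional database drivers,
# installable with standard pip extras syntax, e.g.:
#   pip install "watchdogs[mysql]"   or   pip install "watchdogs[postgresql]"
# When none of 'postgresql' / 'mysql' / 'sqlite' appears in sys.argv,
# the re.search check above adds aiosqlite as the default driver.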
35 | 36 | setup( 37 | name="watchdogs", 38 | version=version, 39 | keywords="requests crawler uniparser torequests fastapi watchdog", 40 | description=description, 41 | long_description=long_description, 42 | long_description_content_type='text/markdown', 43 | license="MIT License", 44 | install_requires=install_requires, 45 | py_modules=["watchdogs"], 46 | package_data={ 47 | 'watchdogs': [ 48 | 'templates/*.html', 'static/img/*.*', 'static/js/*.js', 49 | 'static/css/*.css', 'static/css/fonts/*.*' 50 | ] 51 | }, 52 | extras_require={ 53 | "postgresql": ["asyncpg", "psycopg2-binary"], 54 | "mysql": ["aiomysql", "pymysql"], 55 | "sqlite": ["aiosqlite"] 56 | }, 57 | classifiers=[ 58 | "License :: OSI Approved :: MIT License", 59 | 'Programming Language :: Python', 60 | "Programming Language :: Python :: 3", 61 | "Programming Language :: Python :: 3.6", 62 | "Programming Language :: Python :: 3.7", 63 | "Programming Language :: Python :: 3.8", 64 | ], 65 | author="ClericPy", 66 | author_email="clericpy@gmail.com", 67 | url="https://github.com/ClericPy/watchdogs", 68 | packages=find_packages(), 69 | platforms="any", 70 | ) 71 | -------------------------------------------------------------------------------- /watchdogs/templates/logs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | Watchdogs logs 13 | 14 | 15 | 16 | 56 | 57 | 58 | 59 |
60 | max_lines: 61 | 67 | refresh_every: 68 | 74 | log_names: 75 | 81 | 82 |
83 | {% for item in items %} 84 |
85 |
86 | {{item['name']}}.log 87 | {{item['line_no']}} lines ( {{item['file_size']}} / 89 | {{item['file_size_mb']}} MB ), st_mtime: {{item['st_mtime']}} 91 | 95 | 97 |
98 |
99 |
{{item['log_text']}}
100 | {% endfor %} 101 | 102 | 103 | -------------------------------------------------------------------------------- /watchdogs/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from fire import Fire 4 | from uvicorn import run 5 | 6 | from .config import NotSet, ensure_dir 7 | from .settings import Config, get_valid_value, setup 8 | 9 | 10 | def clear_dir(dir_path): 11 | if not dir_path.is_dir(): 12 | print(f'Dir is not exist: {dir_path}.') 13 | return True 14 | print(f'Cleaning {dir_path}...') 15 | for f in dir_path.iterdir(): 16 | if f.is_dir(): 17 | clear_dir(f) 18 | else: 19 | f.unlink() 20 | print(f'File removed: {f}') 21 | dir_path.rmdir() 22 | print(f'Folder removed: {dir_path}') 23 | 24 | 25 | def init_app(db_url=None, 26 | password=None, 27 | uninstall=False, 28 | mute_std_log=NotSet, 29 | mute_file_log=NotSet, 30 | md5_salt=None, 31 | config_dir=None, 32 | use_default_cdn=False, 33 | allow_new_request=False, 34 | **uvicorn_kwargs): 35 | if config_dir: 36 | Config.CONFIG_DIR = ensure_dir(config_dir) 37 | if uninstall: 38 | clear_dir(Config.CONFIG_DIR) 39 | sys.exit('Config dir cleared.') 40 | if allow_new_request: 41 | # will allow use requests / aiohttp / tPool / Requests in UDFParser 42 | import aiohttp 43 | import requests 44 | from torequests.dummy import Requests 45 | from torequests.main import tPool 46 | from uniparser.parsers import UDFParser 47 | 48 | UDFParser._GLOBALS_ARGS.update(aiohttp=aiohttp, 49 | requests=requests, 50 | Requests=Requests, 51 | tPool=tPool) 52 | # backward compatibility for ignore_stdout_log & ignore_file_log 53 | Config.mute_std_log = get_valid_value( 54 | [uvicorn_kwargs.pop('ignore_stdout_log', NotSet), mute_std_log], 55 | Config.mute_std_log) 56 | Config.mute_file_log = get_valid_value( 57 | [uvicorn_kwargs.pop('ignore_file_log', NotSet), mute_file_log], 58 | Config.mute_file_log) 59 | # update by given uvicorn_kwargs 60 | Config.uvicorn_kwargs.update(uvicorn_kwargs) 61 | if db_url: 62 | # update by given db_url 63 | Config.db_url = db_url 64 | Config.password = password 65 | Config.md5_salt = md5_salt or '' 66 | setup(use_default_cdn=use_default_cdn) 67 | from .app import app 68 | return app 69 | 70 | 71 | def start_app(db_url=None, 72 | password=None, 73 | uninstall=False, 74 | mute_std_log=NotSet, 75 | mute_file_log=NotSet, 76 | md5_salt=None, 77 | config_dir=None, 78 | use_default_cdn=False, 79 | allow_new_request=False, 80 | **uvicorn_kwargs): 81 | app = init_app(db_url=db_url, 82 | password=password, 83 | uninstall=uninstall, 84 | mute_std_log=mute_std_log, 85 | mute_file_log=mute_file_log, 86 | md5_salt=md5_salt, 87 | config_dir=config_dir, 88 | use_default_cdn=use_default_cdn, 89 | allow_new_request=allow_new_request, 90 | **uvicorn_kwargs) 91 | 92 | run(app, **Config.uvicorn_kwargs) 93 | 94 | 95 | def main(): 96 | argv = sys.argv 97 | if ('-h' in argv or '--help' in argv) and '--' not in argv: 98 | print( 99 | '"-h" and "--help" should be after "--", examples:\n > python -m watchdogs -- -h\n > python run_server.py -- -h' 100 | ) 101 | return 102 | Fire(start_app) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /quick_start.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Given a mission 5 | get the most popular repository in the github python trending page. 6 | 1. 
Here we crawl and parse the HTML from https://github.com/trending/python?since=daily
7 | 2. ~~Although you can get it from api.github.com~~
8 | 
9 | # Create a CrawlerRule
10 | 
11 | 1. Get the request args
12 |     1. Use the URL: https://github.com/trending/python?since=daily
13 |     2. Or copy the cURL string from Chrome
14 |         1. Chrome DevTools -> Network -> url (right-click) -> Copy -> Copy as cURL
15 |         2. Some scenes need cookie authentication or anti-crawler headers.
16 |         3. ![Copy cURL](https://github.com/ClericPy/watchdogs/raw/master/images/d1.png)
17 | 2. Create the crawler rule
18 |     1. Open the watchdogs page, which defaults to http://127.0.0.1:9901/
19 |     2. Click the \<New Task\> tab.
20 |     3. The first step is to set the CrawlerRule's meta info.
21 |         1. Start by making sure the request is correct.
22 |         2. Click the request args link.
23 |         3. Input the cURL string or URL you got before.
24 |         4. ![](https://github.com/ClericPy/watchdogs/raw/master/images/d2.png)
25 |         5. It then generates the default regex & request args; the regex may need some changes to match more URL patterns.
26 |         6. Click the download button and wait for it to finish downloading => Response Body [200]
27 |             1. If the input object is still null after downloading, it needs to be input manually.
28 |         7. Check the downloaded source code and make sure it is what you want.
29 |             1. You can also check it in the parse rules by using a rule named `__schema__`; the parser will raise an error unless this `__schema__` rule returns `True`.
30 |     4. Now set the ParseRules of this CrawlerRule.
31 |         1. A valid CrawlerRule should contain a `text` rule and a `url` rule, though the `url` rule is optional.
32 |         2. Delete the existing `text` rule and create a new parse rule named `list`.
33 |         3. Create a new parse rule as below: ![](https://github.com/ClericPy/watchdogs/raw/master/images/d3.png)
34 |             1. Here we get the list items for the child rules.
35 |         4. The `list` rule then needs two child rules named `text` and `url`.
36 |         5. Create a new parse rule named `text` like this: ![](https://github.com/ClericPy/watchdogs/raw/master/images/d4.png)
37 |             1. Click the button to send the `text` rule to the `list` rule.
38 |         6. Create a new parse rule named `url` the same way as `text`, or skip this rule. Note that the `$text` attribute should be `@href` here, to get the href attribute. This rule also needs to be sent to the `list` rule.
39 |     5. OK, now click the `Parse` button to parse this CrawlerRule and get the result.
40 |     6. Click the \<1. Save Crawler Rule\> button to save the rule into the database.
41 | 
42 | > Parse result
43 | 
44 | ```javascript
45 | {'Trending Python repositories on GitHub today · GitHub': {'list': {'text': 'gwen001 / pentest-tools', 'url': 'https://github.com/gwen001/pentest-tools'}}}
46 | ```
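The nested object above is the raw parser output. Watchdogs then normalizes it into the flat `{'text': ..., 'url': ...}` shape that gets stored as the task result, using `get_watchdog_result` from `watchdogs/utils.py` (included later in this listing). A minimal sketch of that normalization:

```python
from watchdogs.utils import get_watchdog_result

# Raw parser output: nested {rule_name: {child_rule: ...}} dicts.
parsed = {
    'Trending Python repositories on GitHub today · GitHub': {
        'list': {
            'text': 'gwen001 / pentest-tools',
            'url': 'https://github.com/gwen001/pentest-tools',
        }
    }
}

# get_watchdog_result drills into single-key dicts until it finds 'text',
# then keeps the optional keys ('url', 'title', 'cover', 'key', ...).
print(get_watchdog_result(parsed))
# {'text': 'gwen001 / pentest-tools', 'url': 'https://github.com/gwen001/pentest-tools'}
```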
47 | 
48 | > CrawlerRule JSON. This JSON string can be loaded back by clicking the \<Load Rule\> button.
49 | 
50 | ```javascript
51 | {"name":"Trending Python repositories on GitHub today · GitHub","request_args":{"method":"get","url":"https://github.com/trending/python?since=daily","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"list","chain_rules":[["css","h1.lh-condensed>a","$string"],["python","index","0"],["re","=\"/","@=\"https://github.com/"]],"child_rules":[{"name":"text","chain_rules":[["css","a","$text"],["py","index","0"],["udf","input_object.strip().replace('\\n', '')",""]],"child_rules":[],"iter_parse_child":false},{"name":"url","chain_rules":[["css","a","@href"],["python","index","0"]],"child_rules":[],"iter_parse_child":false}],"iter_parse_child":false}],"regex":"^https://github\\.com/trending/python\\?since=daily$","encoding":""}
52 | ```
53 | 
54 | # Create a Task
55 | 
56 | 1. Click the \<2. Add New Task\> button.
57 | 2. Confirm the task info. ![](https://github.com/ClericPy/watchdogs/raw/master/images/d5.png)
58 | 3. Click the submit button. The task is created.
59 | 
60 | # Update a Task
61 | 
62 | 1. Click the \<Tasks\> tab.
63 | 2. Double-click the task's row.
64 | 3. Update it and submit.
65 | 
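One task field worth a note is `work_hours`, which controls when a task is allowed to crawl. It is evaluated by `check_work_time` in `watchdogs/utils.py` (included later in this listing); a short sketch of the accepted formats, adapted from that function's docstring:

```python
from datetime import datetime

from watchdogs.utils import check_work_time

# A fixed timestamp so the checks below are reproducible.
now = datetime.strptime('2020-03-14 11:47:32', '%Y-%m-%d %H:%M:%S')

assert check_work_time('0, 24', now)          # hour range: always allowed
assert check_work_time('[1, 2, 3, 11]', now)  # explicit list of hours
assert check_work_time('%d==14', now)         # strftime equality check
assert check_work_time('16, 24|%M==47', now)  # '|' means any clause may pass
assert not check_work_time('0, 5', now)       # 11:47 is outside hours 0-5
```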
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [watchdogs](https://github.com/ClericPy/watchdogs) [![PyPI](https://img.shields.io/pypi/v/watchdogs?style=plastic)](https://pypi.org/project/watchdogs/)![PyPI - Wheel](https://img.shields.io/pypi/wheel/watchdogs?style=plastic)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/watchdogs?style=plastic)![PyPI - Downloads](https://img.shields.io/pypi/dm/watchdogs?style=plastic)![PyPI - License](https://img.shields.io/pypi/l/watchdogs?style=plastic)
2 | 
3 | Keep an eye on the changes of the web world.
4 | 
5 | Such as `post articles` / `news on the web portal` / `server api health` / `binge-watching` / `steam price fluctuation` / `github events` / `updates of comic and novel`, and so on...
6 | 
7 | ## Intro
8 | 
9 | > [中文文档 (Chinese docs)](https://clericpy.github.io/blog/posts/20200331171211/)
10 | 
11 | 1. This is a web app based on [fastapi](https://github.com/tiangolo/fastapi), [databases](https://github.com/encode/databases), [uniparser](https://github.com/ClericPy/uniparser), and [torequests](https://github.com/ClericPy/torequests).
12 | 2. Deploy it smoothly with pip: `pip install -U watchdogs;python3 -m watchdogs`
13 | 3. Creating a new crawler with the Web UI is simple, instead of the old way of writing duplicate code.
14 | 4. All the crawlers keep running in the async environment.
15 | 5. Almost all the elements in the Web UI have a *title* attribute describing their features, which means the docs live on the UI itself.
16 | 6. Free your hands from repetitively refreshing pages in the browser.
17 |     1. Subscribe to the change events with RSS reader extensions, such as [Feedbro](https://chrome.google.com/webstore/detail/feedbro/mefgmmbdailogpfhfblcnnjfmnpnmdfa) or RSS Feed Reader.
18 |     2. Or implement a class that inherits from `watchdogs.callbacks.Callback` (see the sketch at the end of this README).
19 | 
20 | ## Usage
21 | 
22 | 1. > pip install -U watchdogs
23 | 
24 | 2. > python -m watchdogs
25 | 
26 | 3. > Open the browser: http://127.0.0.1:9901
27 | 
28 | ### Command line args
29 | 
30 | > python -m watchdogs -- -h
31 | 
32 | - **db_url**:
33 |   > a sqlite / mysql / postgresql (not tested) URL, anything [databases](https://github.com/encode/databases) supports. Defaults to 'sqlite:///{HOME_PATH}/watchdogs/storage.sqlite'
34 | - **password**:
35 |   > initial password; if empty, it can be set on the first visit in the web UI.
36 | - **mute_std_log**:
37 |   > mute the stdout log for a clean stream.
38 | - **mute_file_log**:
39 |   > mute the file log located in the {HOME_PATH}/watchdogs folder.
40 | - **md5_salt**:
41 |   > salt for the custom md5(password) / md5(rss_tag) hashes.
42 | - **config_dir**:
43 |   > config dir for saving the logs and config files (including the sqlite file when using sqlite); defaults to {HOME_PATH}/watchdogs.
44 | - **use_default_cdn**:
45 |   > if Config.cdn_urls is not set and use_default_cdn is True, online js/css CDN links from staticfile.org will be used.
46 | - **\*\*uvicorn_kwargs**:
47 |   > uvicorn startup kwargs, such as port and host, which can be set like: `python -m watchdogs --port=9999 --host=127.0.0.1 --access-log=False`
48 | 
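These options can also be passed programmatically instead of via the CLI. A minimal sketch mirroring `start_app` in `watchdogs/main.py` (the `db_url` and `password` values here are placeholders):

```python
from uvicorn import run

from watchdogs import Config, init_app

# init_app accepts the same options as the command line; unknown keyword
# arguments (port, host, ...) are merged into Config.uvicorn_kwargs.
app = init_app(db_url='sqlite:///storage.sqlite',  # placeholder db_url
               password='change-me',               # placeholder password
               port=9901)
run(app, **Config.uvicorn_kwargs)
```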
49 | ### Quick Start to Create a New Task
50 | 
51 | [Quick Start Screenshots](https://github.com/ClericPy/watchdogs/blob/master/quick_start.md)
52 | 
53 | 
54 | ## Web UI
55 | 
56 | 
57 | Screenshots
58 | 
59 | 1. Welcome Page (Tasks Page).
60 |    > Here you can see all the tasks' metadata, go to the RSS / Mobile Lite pages, and perform operations on the tasks.
61 | 
62 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/1.png)
63 | 
64 | 2. New Task Page.
65 |    > Based on the latest [uniparser](https://github.com/ClericPy/uniparser) web app, for creating new rules as well as tasks.
66 | 
67 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/2.png)
68 | 
69 | 3. Rules Page.
70 |    > Perform operations on the rules.
71 | 
72 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/3.png)
73 | 
74 | 4. API Page.
75 |    > Based on the [fastapi](https://github.com/tiangolo/fastapi) `/docs`, which is generated automatically.
76 | 
77 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/4.png)
78 | 
79 | 5. Mobile Page (Lite View).
80 |    > A view for mobile phones, to glimpse the latest results of the current 30 tasks.
81 | 
82 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/5.png)
83 | 
84 | 
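As mentioned in the Intro, custom notifications are added by implementing a class that inherits from `watchdogs.callbacks.Callback` (the full base class is in `watchdogs/callbacks.py` below). A minimal sketch; the `my_webhook` name and endpoint URL are hypothetical placeholders:

```python
from json import loads

from watchdogs.callbacks import Callback


class WebhookCallback(Callback):
    """POST the latest task result to a (hypothetical) webhook."""
    # Tasks route here when task.custom_info starts with 'my_webhook:'.
    name = 'my_webhook'

    def __init__(self):
        from torequests.dummy import Requests
        self.req = Requests()

    async def callback(self, task):
        latest_result = loads(task.latest_result or '{}')
        # example.com is a placeholder; point this at your own service.
        r = await self.req.post('https://example.com/webhook',
                                json={
                                    'task': task.name,
                                    'text': latest_result.get('text') or '',
                                    'url': latest_result.get('url') or task.origin_url,
                                })
        return f'sent: {bool(r)}'
```

Subclasses are collected via `Callback.__subclasses__()` when the callback handler initializes, so the class must be defined (imported) before the app starts.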
85 | 86 | 92 | -------------------------------------------------------------------------------- /watchdogs/templates/groups.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Groups v{{version}} 10 | 11 | 12 | 50 | 51 | 52 | 53 |
54 |
55 |

56 | Watchdogs 57 |

58 |
59 | 60 | 62 |
63 |
64 |
65 | {% for group in groups %} 66 |
67 | id: 68 | name: 69 | task_ids: 71 | 72 | 74 | Lite 75 | Feed 76 |
77 | {% endfor %} 78 |
79 |
80 |
81 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /watchdogs/static/img/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 15 | 17 | 19 | 56 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /watchdogs/templates/feeds.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Timeline v{{version}} 10 | 11 | 12 | 172 | 173 | 174 | 175 |
176 |
177 |

Watchdogs

178 |
179 |
180 |
181 |
182 | {% for task in error_tasks %} 183 |

[{{task.name}}]: {{task.error}}

184 | {% endfor %} 185 |
186 | {%- if not feeds -%} 187 |

No Feeds.

188 | {% endif %} 189 | {% for feed in feeds %} 190 | {%- if feed.get("name") -%} 191 |
192 |

193 | {{feed.name}} 194 |

195 |

{{feed.text}}

196 |

197 | {{feed.ts_create.strftime('%Y-%m-%d %H:%M:%S')}} 198 | 199 | - {{feed.timeago}} ago 200 |

201 |
202 | {% endif %} 203 | {%- if feed.get("current_date") -%} 204 | 205 | {% endif %} 206 | {% endfor %} 207 |
208 |
209 |
210 | {% if last_page_url %} 211 | < 212 | {% endif %} 213 | Home 214 | RSS 215 | {% if next_page_url %} 216 | > 217 | {% endif %} 218 |
219 |
220 | 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /watchdogs/callbacks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from json import loads 3 | from logging import getLogger 4 | from traceback import format_exc 5 | from typing import Dict, Type 6 | 7 | from torequests.utils import ttime 8 | 9 | from .utils import ensure_await_result 10 | 11 | 12 | class CallbackHandlerBase(ABC): 13 | logger = getLogger('watchdogs') 14 | 15 | def __init__(self): 16 | # lazy init object 17 | self.callbacks_dict: Dict[str, Type[Callback]] = {} 18 | for cls in Callback.__subclasses__(): 19 | try: 20 | assert cls.name is not None 21 | cls.doc = cls.doc or cls.__doc__ 22 | self.callbacks_dict[cls.name] = cls 23 | except Exception as err: 24 | self.logger.error(f'{cls} registers failed: {err!r}') 25 | self.workers = {cb.name: cb.doc for cb in self.callbacks_dict.values()} 26 | 27 | @abstractmethod 28 | async def callback(self, task): 29 | pass 30 | 31 | def get_callback(self, name): 32 | obj = self.callbacks_dict.get(name) 33 | if not obj: 34 | # not found callback 35 | return None 36 | if not isinstance(obj, Callback): 37 | # here for lazy init 38 | obj = obj() 39 | self.callbacks_dict[name] = obj 40 | return obj 41 | 42 | 43 | class CallbackHandler(CallbackHandlerBase): 44 | 45 | def __init__(self): 46 | super().__init__() 47 | 48 | async def callback(self, task): 49 | custom_info: str = task.custom_info.strip() 50 | name = custom_info.split(':', 1)[0] 51 | cb = self.get_callback(name) or self.get_callback('') 52 | if not cb: 53 | # not found callback, ignore 54 | return 55 | try: 56 | call_result = await ensure_await_result(cb.callback(task)) 57 | self.logger.info( 58 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} {call_result}: ' 59 | ) 60 | except Exception: 61 | self.logger.error( 62 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} error:\n{format_exc()}' 63 | ) 64 | 65 | 66 | class Callback(ABC): 67 | """ 68 | Constraint: Callback object should has this attribute: 69 | cls.name: str 70 | self.callback(task) 71 | if name == '': It's the default callback for null custom info. 72 | More common notify middleware is coming. 73 | """ 74 | logger = getLogger('watchdogs') 75 | # reset by subclass 76 | name: str = None 77 | doc = '' 78 | 79 | @abstractmethod 80 | def callback(self, task): 81 | """task attributes is new crawled""" 82 | pass 83 | 84 | 85 | class ServerChanCallback(Callback): 86 | """ 87 | Wechat notify toolkit. 88 | 89 | 1. Login with github: http://sc.ftqq.com/ 90 | 2. Click http://sc.ftqq.com/?c=code the SCKEY 91 | 3. 
Set the task.custom_info as: server_chan:{SCKEY} 92 | """ 93 | name = "server_chan" 94 | 95 | # doc = 'http://sc.ftqq.com/' 96 | TEXT_SLICE_LENGTH = 200 97 | 98 | def __init__(self): 99 | from torequests.dummy import Requests 100 | self.req = Requests() 101 | 102 | async def callback(self, task): 103 | name, arg = task.custom_info.split(':', 1) 104 | if not arg: 105 | raise ValueError( 106 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`' 107 | ) 108 | latest_result = loads(task.latest_result or '{}') 109 | text = latest_result.get('text') or '' 110 | url = latest_result.get('url') or task.origin_url 111 | title = f'{task.name}#{text[:self.TEXT_SLICE_LENGTH]}' 112 | body = f'{url}\n\n{text}' 113 | oks = [] 114 | for key in set(arg.strip().split()): 115 | if not key or not key.strip(): 116 | continue 117 | key = key.strip() 118 | r = await self.req.post(f'https://sc.ftqq.com/{key}.send', 119 | data={ 120 | 'text': title, 121 | 'desp': body 122 | }) 123 | self.logger.info(f'ServerChanCallback ({key}): {r.text}') 124 | oks.append((key, bool(r))) 125 | return f'{len(oks)} sended, {oks}' 126 | 127 | 128 | class DingTalkCallback(Callback): 129 | """ 130 | DingDing robot notify toolkit. Will auto check msg type as text / card. 131 | 132 | 1. Create a group. 133 | 2. Create a robot which contains word ":" 134 | 3. Set the task.custom_info as: dingding:{access_token} 135 | 136 | Doc: https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq/e9d991e2 137 | """ 138 | name = "dingding" 139 | 140 | def __init__(self): 141 | from torequests.dummy import Requests 142 | self.req = Requests() 143 | 144 | def make_data(self, task): 145 | latest_result = loads(task.latest_result or '{}') 146 | title = latest_result.get('title') or '' 147 | url = latest_result.get('url') or task.origin_url 148 | text = latest_result.get('text') or '' 149 | cover = latest_result.get('cover') or '' 150 | if cover: 151 | text = f'![cover]({cover})\n{text}' 152 | if url or cover: 153 | # markdown 154 | title = f'# {task.name}: {title}\n> {ttime()}' 155 | return { 156 | "actionCard": { 157 | "title": title, 158 | "text": f'{title}\n\n{text}', 159 | "singleTitle": "Read More", 160 | "singleURL": url 161 | }, 162 | "msgtype": "actionCard" 163 | } 164 | return { 165 | "msgtype": "text", 166 | "text": { 167 | "content": f"{task.name}: {title}\n{text}" 168 | } 169 | } 170 | 171 | async def callback(self, task): 172 | name, arg = task.custom_info.split(':', 1) 173 | if not arg: 174 | raise ValueError( 175 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`' 176 | ) 177 | 178 | data = self.make_data(task) 179 | oks = [] 180 | for access_token in set(arg.strip().split()): 181 | if not access_token or not access_token.strip(): 182 | continue 183 | access_token = access_token.strip() 184 | r = await self.req.post( 185 | f'https://oapi.dingtalk.com/robot/send?access_token={access_token}', 186 | json=data) 187 | self.logger.info( 188 | f'{self.__class__.__name__} ({access_token}): {r.text}') 189 | oks.append((access_token, bool(r))) 190 | return f'{len(oks)} sended, {oks}' 191 | -------------------------------------------------------------------------------- /watchdogs/templates/lite.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Lite v{{version}} 10 | 11 | 12 | 168 | 169 | 170 | 171 |
172 |
173 |

Watchdogs

174 |
175 |
176 |
177 | {%- if not tasks -%} 178 |

No Tasks.

179 | {% endif %} {% for task in tasks %} 180 |
181 |

182 | {{task.name}} 183 |

184 |

{{task.text}}

185 |

186 | {{ '✔' if task.enable else '✖'}} 187 | {{task.last_change_time.strftime('%Y-%m-%d %H:%M:%S')}} 188 | 189 | - {{task.timeago}} ago 190 | 📂 191 | View More 192 | {%- if task.error -%} 193 |
194 | 195 | Error: {{task.error}} 196 | 197 | {% endif %} 198 |

199 |
200 | {% endfor %} 201 |
202 |
203 |
204 | {% if last_page_url %} 205 | < 206 | {% endif %} 207 | Home 208 | RSS 209 | {% if next_page_url %} 210 | > 211 | {% endif %} 212 |
213 |
214 | 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /watchdogs/config.py: -------------------------------------------------------------------------------- 1 | from logging import ERROR, INFO, Formatter, getLogger 2 | from pathlib import Path 3 | from time import time 4 | from traceback import format_exc 5 | from typing import Any, Callable, Dict, List 6 | 7 | from databases import Database 8 | from fastapi import Request 9 | from fastapi.middleware.gzip import GZipMiddleware 10 | from frequency_controller import AsyncFrequency 11 | from starlette.middleware.base import BaseHTTPMiddleware 12 | from starlette.responses import JSONResponse, RedirectResponse 13 | from torequests.utils import md5 as _md5 14 | from torequests.utils import parse_qsl, quote_plus, unparse_qsl 15 | from uniparser.crawler import RuleStorage 16 | 17 | from .callbacks import CallbackHandlerBase 18 | 19 | logger = getLogger('watchdogs') 20 | logger.setLevel(INFO) 21 | 22 | NotSet = object() 23 | 24 | 25 | # @app.exception_handler(Exception) 26 | async def exception_handler(request: Request, exc: Exception): 27 | trace_id = str(int(time() * 1000)) 28 | err_name = exc.__class__.__name__ 29 | err_value = str(exc) 30 | msg = f'{err_name}({err_value}) trace_id: {trace_id}:\n{format_exc()}' 31 | logger.error(msg) 32 | return JSONResponse( 33 | status_code=500, 34 | content={ 35 | "message": f"Oops! {err_name}.", 36 | "trace_id": trace_id 37 | }, 38 | ) 39 | 40 | 41 | def ensure_dir(path: Path): 42 | if isinstance(path, str): 43 | path = Path(path) 44 | if path.is_dir(): 45 | return path 46 | else: 47 | paths = list(reversed(path.parents)) 48 | paths.append(path) 49 | p: Path 50 | for p in paths: 51 | if not p.is_dir(): 52 | p.mkdir() 53 | return path 54 | 55 | 56 | def get_sign(path, query): 57 | given_sign = '' 58 | query_list = [] 59 | for key, value in parse_qsl(query, keep_blank_values=True): 60 | if key == 'sign': 61 | given_sign = value 62 | else: 63 | query_list.append(f'{key}={value}') 64 | query_list.sort() 65 | valid_sign = md5(f'{path}?{"&".join(query_list)}') 66 | return given_sign, valid_sign 67 | 68 | 69 | async def auth_checker(request: Request, call_next): 70 | # {'type': 'http', 'http_version': '1.1', 'server': ('127.0.0.1', 9901), 'client': ('127.0.0.1', 7037), 'scheme': 'http', 'method': 'GET', 'root_path': '', 'path': '/auth', 'raw_path': b'/auth', 'query_string': b'', 'headers': [(b'host', b'127.0.0.1:9901'), (b'connection', b'keep-alive'), (b'sec-fetch-dest', b'image'), (b'user-agent', b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'), (b'dnt', b'1'), (b'accept', b'image/webp,image/apng,image/*,*/*;q=0.8'), (b'sec-fetch-site', b'same-origin'), (b'sec-fetch-mode', b'no-cors'), (b'referer', b'http://127.0.0.1:9901/auth'), (b'accept-encoding', b'gzip, deflate, br'), (b'accept-language', b'zh-CN,zh;q=0.9'), (b'cookie', b'ads_id=lakdsjflakjdf; _ga=GA1.1.1550108461.1583462251')], 'fastapi_astack': , 'app': } 71 | path = request.scope['path'] 72 | if path in Config.AUTH_PATH_WHITE_LIST: 73 | # ignore auth check 74 | return await call_next(request) 75 | query_string = request.scope.get('query_string', b'').decode('u8') 76 | query_has_sign = 'sign=' in query_string 77 | if query_has_sign: 78 | # try checking sign 79 | given_sign, valid_sign = Config.get_sign(path, query_string) 80 | if given_sign == valid_sign: 81 | # sign checking pass 82 | return await call_next(request) 83 | # try 
check cookie 84 | if not Config.watchdog_auth or Config.watchdog_auth == request.cookies.get( 85 | 'watchdog_auth', ''): 86 | # valid cookie, or no watchdog_auth checker 87 | return await call_next(request) 88 | # not pass either checker, refused 89 | if query_has_sign: 90 | # request with sign will not redirect 91 | return JSONResponse( 92 | status_code=400, 93 | content={ 94 | "message": 'signature expired', 95 | }, 96 | ) 97 | else: 98 | # bad cookie, reset the watchdog_auth cookie as null 99 | resp = RedirectResponse( 100 | f'/auth?redirect={quote_plus(request.scope["path"])}', 302) 101 | resp.set_cookie('watchdog_auth', '') 102 | return resp 103 | 104 | 105 | class Config: 106 | CONFIG_DIR: Path = ensure_dir(Path.home() / 'watchdogs') 107 | ENCODING = 'utf-8' 108 | AUTH_PATH_WHITE_LIST = {'/auth'} 109 | # db_url defaults to sqlite:// 110 | db_url: str = f'sqlite:///{(CONFIG_DIR / "storage.sqlite").as_posix()}' 111 | db: Database = None 112 | logger = logger 113 | password: str = '' 114 | rule_db: RuleStorage = None 115 | metas = None 116 | check_interval: int = 60 117 | default_interval: int = 5 * 60 118 | default_crawler_timeout: int = 30 119 | downloader_timeout: int = 15 120 | watchdog_auth: str = '' 121 | md5_salt: str = '' 122 | crawler = None 123 | # anti brute force attack 124 | check_pwd_freq = AsyncFrequency(1, 3) 125 | # for anti-crawl frequency 126 | DEFAULT_HOST_FREQUENCY = (1, 1) 127 | cdn_urls: dict = {} 128 | callback_handler: CallbackHandlerBase = None 129 | mute_std_log = False 130 | mute_file_log = False 131 | LOGGING_FILE_CONFIG = { 132 | 'info.log': { 133 | 'file_size_mb': 2, 134 | 'level': INFO, 135 | 'backup_count': 1, 136 | }, 137 | 'error.log': { 138 | 'file_size_mb': 2, 139 | 'level': ERROR, 140 | 'backup_count': 1, 141 | }, 142 | 'server.log': { 143 | 'file_size_mb': 2, 144 | 'level': INFO, 145 | 'backup_count': 1, 146 | }, 147 | } 148 | DEFAULT_LOGGER_FORMATTER = Formatter( 149 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s", 150 | datefmt="%Y-%m-%d %H:%M:%S") 151 | uvicorn_kwargs: dict = {'access_log': True, 'port': 9901} 152 | # check interval 60s, so format do use %M , backup every 12 hours. this pattern may miss for crawl cost more than 60s. 
153 | # db_backup_time: str = '%H:%M==00:00|%H:%M==12:00' 154 | db_backup_time: str = '%H:%M==00:00' 155 | db_backup_count: int = 4 156 | db_backup_function: Callable[..., Any] = None 157 | exception_handlers: list = [ 158 | (Exception, exception_handler), 159 | ] 160 | middlewares = [ 161 | { 162 | 'middleware_class': BaseHTTPMiddleware, 163 | 'dispatch': auth_checker 164 | }, 165 | { 166 | 'middleware_class': GZipMiddleware, 167 | 'minimum_size': 1000 168 | }, 169 | ] 170 | md5_cache_maxsize = 128 171 | query_groups_cache_maxsize = 128 172 | query_group_task_ids_cache_maxsize = 128 173 | query_task_ids_cache_maxsize = 128 174 | query_tasks_cache_maxsize = 128 175 | query_feeds_cache_maxsize = 128 176 | metas_cache_maxsize = 128 177 | sign_cache_maxsize = 128 178 | _md5 = _md5 179 | get_sign = get_sign 180 | background_task = None 181 | background_funcs: List[Callable] = [] 182 | is_shutdown = False 183 | custom_links = [ 184 | { 185 | 'label': 'Auth', 186 | 'url': '/auth', 187 | 'desc': 'change your password', 188 | }, 189 | { 190 | 'label': 'Logs', 191 | 'url': '/log', 192 | 'desc': 'view the logs', 193 | }, 194 | { 195 | 'label': 'Docs', 196 | 'url': '/docs', 197 | 'desc': 'read the docs', 198 | }, 199 | { 200 | 'label': 'Groups', 201 | 'url': '/groups', 202 | 'desc': 'admin the groups', 203 | }, 204 | ] 205 | # custom_tabs = [{'name': 'apis', 'label': 'API', 'url': '/docs'}] 206 | custom_tabs: List[Dict] = [] 207 | COLLATION: str = None 208 | cookie_max_age = 86400 * 7 209 | default_page_size = 20 210 | TEXT_SLICE_LENGTH = 200 211 | 212 | @classmethod 213 | def get_route(cls, path, **kwargs): 214 | params_string = unparse_qsl([ 215 | (k, str(v)) for k, v in kwargs.items() if str(v) 216 | ]) 217 | sign = cls.get_sign(path, params_string)[1] 218 | if params_string: 219 | result = f'{path}?{params_string}&sign={sign}' 220 | else: 221 | result = f'{path}?sign={sign}' 222 | return result 223 | 224 | @classmethod 225 | def add_custom_tabs(cls, label, url, name=None, desc=None): 226 | # desc is nonsense 227 | assert name or label 228 | cls.custom_tabs.append({ 229 | 'label': label, 230 | 'name': name or label, 231 | 'url': url, 232 | 'desc': desc 233 | }) 234 | 235 | @classmethod 236 | def add_custom_links(cls, url, name, label=None, desc=None): 237 | assert name or label 238 | cls.custom_tabs.append({ 239 | 'name': name or label, 240 | 'label': label or name, 241 | 'url': url, 242 | 'desc': desc 243 | }) 244 | 245 | @classmethod 246 | def setup_middleware(cls, app): 247 | for middleware in cls.middlewares: 248 | app.add_middleware(**middleware) 249 | 250 | 251 | def md5(obj, n=32, with_salt=True): 252 | if not with_salt: 253 | return Config._md5(str(obj).encode('utf-8'), n=n, skip_encode=True) 254 | salt = Config.md5_salt 255 | if not salt: 256 | raise ValueError('Config.md5_salt should not be null') 257 | return Config._md5(f'{obj}{salt}'.encode('utf-8'), n=n) 258 | 259 | 260 | async def md5_checker(obj, target, freq=False): 261 | if freq: 262 | async with Config.check_pwd_freq: 263 | # anti guessing password 264 | return md5(obj) == target 265 | else: 266 | # may get a cache 267 | return md5(obj) == target 268 | -------------------------------------------------------------------------------- /watchdogs/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from inspect import isawaitable 4 | from json import dumps, loads 5 | from logging import getLogger 6 | from sys import _getframe 7 | from traceback 
import format_exc 8 | from typing import Optional 9 | from xml.sax.saxutils import escape 10 | 11 | logger = getLogger('watchdogs') 12 | 13 | 14 | def format_size(size, rounded=2): 15 | unit = 'B' 16 | for _unit in ['B', 'KB', 'MB', 'GB']: 17 | unit = _unit 18 | if size > 1024: 19 | size = size / 1024 20 | else: 21 | break 22 | return f'{round(size, rounded)} {unit}' 23 | 24 | 25 | async def ensure_await_result(result): 26 | if isawaitable(result): 27 | return await result 28 | return result 29 | 30 | 31 | def _check_work_time(work_hours, now: Optional[datetime] = None): 32 | now = now or datetime.now() 33 | if '==' in work_hours: 34 | # check work days, using strftime 35 | fmt, target = work_hours.split('==') 36 | current = now.strftime(fmt) 37 | # check current time format equals to target 38 | return current == target 39 | elif '!=' in work_hours: 40 | # check work days, using strftime 41 | fmt, target = work_hours.split('!=') 42 | current = now.strftime(fmt) 43 | # check current time format equals to target 44 | return current != target 45 | else: 46 | # other hours format 47 | current_hour = now.hour 48 | if work_hours[0] == '[' and work_hours[-1] == ']': 49 | work_hours_list = sorted(loads(work_hours)) 50 | else: 51 | nums = [int(num) for num in re.findall(r'\d+', work_hours)] 52 | work_hours_list = sorted(range(*nums)) 53 | # check if current_hour is work hour 54 | return current_hour in work_hours_list 55 | 56 | 57 | def check_work_time(work_hours, now: Optional[datetime] = None): 58 | """Check time if fit work_hours. 59 | 60 | :: Test Code 61 | 62 | from watchdogs.utils import check_work_time, datetime 63 | 64 | now = datetime.strptime('2020-03-14 11:47:32', '%Y-%m-%d %H:%M:%S') 65 | 66 | oks = [ 67 | '0, 24', 68 | '[1, 2, 3, 11]', 69 | '[1, 2, 3, 11];%Y==2020', 70 | '%d==14', 71 | '16, 24|[11]', 72 | '16, 24|%M==47', 73 | '%M==46|%M==47', 74 | '%H!=11|%d!=12', 75 | '16, 24|%M!=41', 76 | ] 77 | 78 | for work_hours in oks: 79 | ok = check_work_time(work_hours, now) 80 | print(ok, work_hours) 81 | assert ok 82 | 83 | no_oks = [ 84 | '0, 5', 85 | '[1, 2, 3, 5]', 86 | '[1, 2, 3, 11];%Y==2021', 87 | '%d==11', 88 | '16, 24|[12]', 89 | '%M==17|16, 24', 90 | '%M==46|[1, 2, 3]', 91 | '%H!=11&%d!=12', 92 | '%M!=46;%M!=47', 93 | ] 94 | 95 | for work_hours in no_oks: 96 | ok = check_work_time(work_hours, now) 97 | print(ok, work_hours) 98 | assert not ok 99 | 100 | 101 | """ 102 | now = now or datetime.now() 103 | if '|' in work_hours: 104 | if '&' in work_hours or ';' in work_hours: 105 | raise ValueError('| can not use with "&" or ";"') 106 | return any((_check_work_time(partial_work_hour, now) 107 | for partial_work_hour in work_hours.split('|'))) 108 | else: 109 | if ('&' in work_hours or ';' in work_hours) and '|' in work_hours: 110 | raise ValueError('| can not use with "&" or ";"') 111 | return all((_check_work_time(partial_work_hour, now) 112 | for partial_work_hour in re.split('&|;', work_hours))) 113 | 114 | 115 | def get_watchdog_result(item): 116 | """ 117 | Parse result format like: 118 | {'text': 'xxx'} 119 | {'text': 'xxx', 'url': 'xxx'} 120 | {'rule_name': {'text': 'xxx'}} 121 | {'__result__': {'rule_name': {'text': 'xxx'}}} 122 | 123 | def test_result_schema(): 124 | # standard result 125 | result = get_watchdog_result({ 126 | 'url': 'https://www.python.org/dev/peps/pep-0001', 127 | 'text': 'text' 128 | }) 129 | # print(result) 130 | assert result == { 131 | 'url': 'https://www.python.org/dev/peps/pep-0001', 132 | 'text': 'text' 133 | } 134 | # only text 135 | result = 
get_watchdog_result('https://www.python.org/dev/peps/pep-0001') 136 | # print(result) 137 | assert result == {'text': 'text not found'} 138 | # embed request 139 | result = get_watchdog_result({ 140 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 141 | '__result__': { 142 | 'detail': { 143 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 144 | } 145 | } 146 | }) 147 | # print(result) 148 | assert result == {'text': 'PEP 1 -- PEP Purpose and Guidelines'} 149 | # embed request list 150 | result = get_watchdog_result({ 151 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 152 | '__result__': { 153 | 'detail': [{ 154 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 155 | }] 156 | } 157 | }) 158 | # print(result) 159 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}] 160 | # embed request list2 161 | result = get_watchdog_result({ 162 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 163 | '__result__': { 164 | 'rule_name': { 165 | '__result__': { 166 | 'detail': [{ 167 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 168 | }] 169 | } 170 | } 171 | } 172 | }) 173 | # print(result) 174 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}] 175 | # child rule result 176 | result = get_watchdog_result({ 177 | 'url': 'https://www.python.org/dev/peps/pep-0001', 178 | 'text': 'text' 179 | }) 180 | # print(result) 181 | assert result == { 182 | 'text': 'text', 183 | 'url': 'https://www.python.org/dev/peps/pep-0001' 184 | } 185 | result = get_watchdog_result({ 186 | 'list': { 187 | 'detail': [{ 188 | 'text': 'Wake up to WonderWidgets!', 189 | 'url': 'all' 190 | }, { 191 | 'text': 'Overview', 192 | 'url': 'all' 193 | }] 194 | } 195 | }) 196 | # print(result) 197 | assert result == [{ 198 | 'text': 'Wake up to WonderWidgets!', 199 | 'url': 'all' 200 | }, { 201 | 'text': 'Overview', 202 | 'url': 'all' 203 | }] 204 | 205 | """ 206 | result = {'text': 'text not found'} 207 | if isinstance(item, dict): 208 | __result__ = item.pop('__result__', None) 209 | if __result__: 210 | # may be __result__ > __result__ > __result__ nested... 
211 | return get_watchdog_result(__result__.popitem()[1]) 212 | text = item.get('text') 213 | if text is None: 214 | return get_watchdog_result(item.popitem()[1]) 215 | result = {'text': str(text)} 216 | for key in ['__key__', 'unique', 'key', 'cover', 'url', 'title']: 217 | if key in item: 218 | value = item[key] 219 | if value and str(value): 220 | result[key] = str(value) 221 | 222 | elif isinstance(item, (list, tuple)): 223 | result = [get_watchdog_result(i) for i in item] 224 | return result 225 | 226 | 227 | class SoloLock: 228 | 229 | def __init__(self): 230 | self.runnings: set = set() 231 | 232 | @property 233 | def current_name(self): 234 | return _getframe(2).f_code.co_name 235 | 236 | def acquire(self, name=None): 237 | name = name or self.current_name 238 | if name in self.runnings: 239 | raise RuntimeError(f'[{name}] is still running.') 240 | self.runnings.add(name) 241 | 242 | def release(self, name=None): 243 | name = name or self.current_name 244 | self.runnings.discard(name) 245 | 246 | def __enter__(self): 247 | self.acquire(self.current_name) 248 | return self 249 | 250 | def __exit__(self, *args): 251 | self.release(self.current_name) 252 | return self 253 | 254 | 255 | async def try_catch(func, *args, **kwargs): 256 | try: 257 | return await ensure_await_result(func(*args, **kwargs)) 258 | except BaseException as err: 259 | logger.error( 260 | f'Catch an error while running {func.__name__}: {format_exc()}') 261 | return err 262 | 263 | 264 | def ignore_error(func, *args, **kwargs): 265 | try: 266 | return func(*args, **kwargs) 267 | except BaseException as err: 268 | return err 269 | 270 | 271 | def gen_rss(data): 272 | nodes = [] 273 | channel = data['channel'] 274 | item_keys = ['title', 'description', 'link', 'guid', 'pubDate'] 275 | for item in data['items']: 276 | item_nodes = [] 277 | for key in item_keys: 278 | value = item.get(key) 279 | if value: 280 | item_nodes.append(f'<{key}>{escape(value)}') 281 | nodes.append(''.join(item_nodes)) 282 | items_string = ''.join((f'{tmp}' for tmp in nodes)) 283 | return rf''' 284 | 285 | 286 | {channel['title']} 287 | {channel['link']} 288 | {channel['description']} 289 | 290 | {channel['link']}/static/img/favicon.svg 291 | {channel['title']} 292 | {channel['link']} 293 | 32 294 | 32 295 | 296 | {items_string} 297 | 298 | 299 | ''' 300 | 301 | 302 | def get_result_key(result: dict): 303 | key = result.get('__key__', result.get('key')) 304 | if key: 305 | return key 306 | else: 307 | return dumps(result, sort_keys=True) 308 | 309 | 310 | solo = SoloLock() 311 | -------------------------------------------------------------------------------- /watchdogs/static/js/clipboard.min.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * clipboard.js v2.0.4 3 | * https://zenorocha.github.io/clipboard.js 4 | * 5 | * Licensed MIT © Zeno Rocha 6 | */ 7 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n({activeName:"tasks",uniparser_iframe_loaded:!1,task_info_visible:!1,rule_info_visible:!1,current_host_rule:{},new_task_form:{},has_more:!0,task_list:[],current_page:0,host_list:[],visible_host_list:[],current_host:"",tag_types:["","success","info","warning","danger"],query_tasks_args:{order_by:"last_change_time",sort:"desc",tag:""},callback_workers:{},custom_links:[],custom_tabs:[],current_cb_doc:"",init_iframe_rule_json:"",clicked_tab_names:{}}),methods:{add_new_task(){try{JSON.parse(this.new_task_form.result_list)}catch(e){this.$alert("Invalid JSON for result_list.");return}try{JSON.parse(this.new_task_form.request_args)}catch(s){this.$alert("Invalid JSON for request_args.");return}this.task_info_visible=!1;let t=JSON.stringify(this.new_task_form);this.$http.post("add_new_task",t).then(e=>{var s=e.body;"ok"==s.msg?(this.$message({message:"Update task "+this.new_task_form.name+" success: "+s.msg,type:"success"}),this.reload_tasks()):this.$message.error({message:"Update task "+this.new_task_form.name+" failed: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},init_iframe_crawler_rule(e){e?this.sub_app.new_rule_json=e:/httpbin\.org\/html/g.test(this.sub_app.new_rule_json)?this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"https://importpython.com/blog/feed/","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["xml","channel>item>title","$text"],["python","getitem","[0]"]],"child_rules":""},{"name":"url","chain_rules":[["xml","channel>item>link","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^https?://importpython.com/blog/feed/$","encoding":""}':this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"http://httpbin.org/html","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["css","body h1","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^http://httpbin.org/html$","encoding":""}',this.sub_app.input_object="",this.sub_app.request_status="",this.sub_app.load_rule()},load_rule(e){this.sub_app.new_rule_json=e,this.sub_app.load_rule()},view_host_by_req(e){let s=JSON.parse(e).url;if(!s){this.$alert("request_args.url should not be null");return}document.getElementById("tab-rules").click(),setTimeout(()=>{this.current_host=new URL(s).hostname},0),this.task_info_visible=!1},view_crawler_rule_by_req(e){if(!e){this.$alert("request_args should not be null");return}this.$http.post("find_crawler_rule",e).then(e=>{var s=e.body;if("ok"==s.msg){let t=JSON.parse(s.result);this.view_crawler_rule(t),this.task_info_visible=!1}else this.$message.error({message:"rule not find in db: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},view_crawler_rule(e){this.rule_info_visible=!1,document.getElementById("tab-new").click(),this.uniparser_iframe_loaded?this.init_iframe_crawler_rule(JSON.stringify(e)):this.init_iframe_rule_json=JSON.stringify(e)},edit_crawler_rule(e){this.$prompt("","Edit Crawler JSON",{confirmButtonText:"OK",cancelButtonText:"Cancel",center:!0,inputType:"textarea",closeOnClickModal:!1,inputValue:JSON.stringify(e,null,2)}).then(({value:e})=>{this.process_crawler_rule("add",JSON.parse(e),0)}).catch(e=>{this.$message({type:"error",message:e})})},process_crawler_rule(e,s,t){let r=JSON.stringify(s||JSON.parse(this.sub_app.current_crawler_rule_json)),a="crawler_rule."+e;1==t&&(a+="?force=1"),this.$http.post(a,r).then(t=>{var r=t.body;"ok"==r.msg?(this.$message({message:e+" rule success",type:"success"}),"pop"==e&&r.result&&this.show_host_rule(this.current_host_rule.host)):"add"==e&&/matched more than 1 rule/g.test(r.msg)?this.$confirm("Failed for url matched more than 1 rule, overwrite it?","Confirm",{confirmButtonText:"Yes",cancelButtonText:"No",type:"error"}).then(()=>{this.process_crawler_rule(e,s,1)}).catch(()=>{this.$message({type:"info",message:"Adding rule canceled."})}):this.$message.error({message:e+" rule failed: "+r.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},show_form_add_new_task(e){if(e){let s="";try{s=this.sub_app.crawler_rule.name}catch(t){console.log(t)}this.new_task_form={task_id:null,name:s,enable:1,tag:"default",error:"",request_args:"",origin_url:"",interval:300,work_hours:"0, 24",max_result_count:30,result_list:"[]",custom_info:""};let r=JSON.parse(this.sub_app.current_crawler_rule_json);this.new_task_form.request_args=JSON.stringify(r.request_args),this.new_task_form.origin_url=r.request_args.url||""}this.task_info_visible=!0},change_enable(e){this.$http.get("enable_task",{params:{task_id:e.task_id,enable:e.enable}}).then(e=>{var s=e.body;"ok"!=s.msg&&this.$message.error({message:"Update enable failed: "+s.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})},sort_change(e){this.query_tasks_args={order_by:e.column.label,sort:(e.column.order||"").replace("ending","")},this.reload_tasks()},reload_tasks(){this.task_list=[],this.current_page=0,this.load_tasks()},load_tasks(){let e=new 
URLSearchParams(window.location.search).get("tag");e?this.query_tasks_args.tag=e:this.query_tasks_args.tag="",current_page=this.current_page+1,this.query_tasks_args.page=current_page,this.$http.get("load_tasks",{params:this.query_tasks_args}).then(e=>{var s=e.body;"ok"==s.msg?(s.tasks.forEach(e=>{this.task_list.push(e)}),this.has_more=s.has_more,this.current_page=current_page):(this.$message.error({message:"Loading tasks failed: "+s.msg}),this.has_more=s.has_more)},e=>{this.$message.error({message:"connect failed: "+e.status})})},load_hosts(){this.$http.get("load_hosts",{params:{host:this.current_host}}).then(e=>{var s=e.body;this.current_host=s.host||"",this.host_list=s.hosts,this.visible_host_list=this.host_list},e=>{this.$message.error({message:"connect failed: "+e.status})})},init_iframe(){this.sub_app&&(this.init_iframe_crawler_rule(this.init_iframe_rule_json),this.init_iframe_rule_json&&(this.$message.success({message:"Rule loaded."}),this.init_iframe_rule_json=""),this.uniparser_iframe_loaded=!0)},handleClick(e){e.name in this.clicked_tab_names||(this.clicked_tab_names[e.name]=1,"rules"==e.name&&this.load_hosts())},escape_html:e=>e?e.replace(/[&<>'"]/g,e=>({"&":"&","<":"<",">":">","'":"'",'"':"""})[e]||e):"",show_time(e){var s='';JSON.parse(e.result_list||"[]"),s+='",s+='",s+='",s+="
last_check_time'+e.last_check_time.replace(/\..*/,"").replace("T"," ")+"
next_check_time'+e.next_check_time.replace(/\..*/,"").replace("T"," ")+"
last_change_time'+e.last_change_time.replace(/\..*/,"").replace("T"," ")+"
",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},get_latest_result(e,s=80){try{let t=JSON.parse(e);return t.title||t.text.slice(0,s)}catch(r){return e}},show_result_list(e){var s="";JSON.parse(e.result_list||"[]").forEach(e=>{if((result=e.result).url)var t='href="'+(result.url||"")+'"';else var t="";s+='"}),s+="
'+e.time+'"+this.escape_html(result.title||result.text)+"
",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},force_crawl(e,s){this.$http.get("force_crawl",{params:{task_name:s.name}}).then(t=>{var r=t.body;if("ok"==r.msg){let a=r.task;Vue.set(this.task_list,e,a),a.error?this.$message.error({message:"Crawl task "+s.name+" "+a.error}):this.$message.success({message:"Crawl task "+s.name+" success"})}else this.$message.error({message:"Crawl task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"force_crawl connect failed: "+e.status})})},row_db_click(e){this.update_task(e)},show_task_error(e){app.$alert(e.error,"Crawler Error",{closeOnClickModal:!0,closeOnPressEscape:!0,center:!0})},update_task(e){this.new_task_form={task_id:e.task_id,name:e.name,enable:e.enable,tag:e.tag,request_args:e.request_args,origin_url:e.origin_url,interval:e.interval,work_hours:e.work_hours,max_result_count:e.max_result_count,result_list:e.result_list||"[]",custom_info:e.custom_info},this.show_form_add_new_task(!1)},delete_task(e,s){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_task",{params:{task_id:s.task_id}}).then(t=>{var r=t.body;"ok"==r.msg?(this.$message.success({message:"Delete task "+s.name+" success"}),this.task_list.splice(e,1)):this.$message.error({message:"Delete task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},delete_host_rule(e){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.$message.success({message:"Delete host "+e+" rule success"}),this.current_host_rule={},this.rule_info_visible=!1,this.load_hosts()):this.$message.error({message:"Delete host "+e+" rule failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},show_host_rule(e){this.$http.get("get_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.current_host_rule=t.host_rule,this.rule_info_visible=!0):this.$message.error({message:"get_host_rule "+e+" failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})},show_work_hours_doc(){let e=``;this.$alert(e,"work_hours format doc",{dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0,customClass:"work_hours_doc"})},check_error_task({row:e,rowIndex:s}){if(e.error)return"warning-row"},click_cb_name(e){this.current_cb_doc=this.callback_workers[e],this.new_task_form.custom_info=e+":"},update_frequency(){let e=this.current_host_rule.host,s=this.current_host_rule.n||0,t=this.current_host_rule.interval||0;this.$http.get("update_host_freq",{params:{host:e,n:s,interval:t}}).then(r=>{var a=r.body;"ok"==a.msg?(this.$message({message:"Update frequency "+e+": "+a.msg,type:"success"}),this.current_host_rule.n=s,this.current_host_rule.interval=t):this.$message.error({message:"update_frequency "+e+" failed: "+JSON.stringify(a)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}},watch:{current_host:function(e){this.visible_host_list=[],/^https?:\/\//g.test(e)&&(e=new 
URL(e).hostname,this.current_host=e),this.host_list.forEach(s=>{s.name.includes(e)&&this.visible_host_list.push(s)})},task_info_visible:function(e){e||(this.current_cb_doc="")}},computed:{uni_iframe:()=>document.getElementById("uni_iframe"),sub_app(){let e=this.uni_iframe;if(e)return e.contentWindow.app}}},vue_app=Vue.extend(Main),app=new vue_app({delimiters:["${","}"]}).$mount("#app");(()=>{var e;let s=document.getElementById("init_vars"),t=JSON.parse(window.atob(s.innerHTML));Object.keys(t).forEach(e=>{app[e]=t[e]}),s.parentNode.removeChild(s),new IntersectionObserver(e=>{!(e[0].intersectionRatio<=0)&&app.has_more&&app.load_tasks()}).observe(document.getElementById("auto_load"))})(); 2 | -------------------------------------------------------------------------------- /watchdogs/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import ensure_future, get_event_loop 3 | from datetime import datetime 4 | from functools import lru_cache 5 | from json import dumps, loads 6 | from logging.handlers import RotatingFileHandler 7 | 8 | from frequency_controller import AsyncFrequency 9 | from uniparser.parsers import Uniparser 10 | 11 | from .config import Config, NotSet, ensure_dir, md5 12 | 13 | 14 | def get_valid_value(values: list, default=None, invalid=NotSet): 15 | for value in values: 16 | if value is not invalid: 17 | return value 18 | return default 19 | 20 | 21 | def get_file_handler(file_name, 22 | file_size_mb=2, 23 | backup_count=1, 24 | level=logging.INFO): 25 | handler = RotatingFileHandler( 26 | Config.CONFIG_DIR / file_name, 27 | maxBytes=1024 * 1024 * Config.LOGGING_FILE_CONFIG.get( 28 | file_name, {}).get('file_size_mb', file_size_mb), 29 | backupCount=Config.LOGGING_FILE_CONFIG.get(file_name, {}).get( 30 | 'backup_count', backup_count), 31 | encoding=Config.ENCODING) 32 | handler.setLevel( 33 | Config.LOGGING_FILE_CONFIG.get(file_name, {}).get('level', level)) 34 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER) 35 | return handler 36 | 37 | 38 | def get_stream_handler(level=logging.INFO): 39 | handler = logging.StreamHandler() 40 | handler.setLevel(level) 41 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER) 42 | return handler 43 | 44 | 45 | def setup_logger(): 46 | watchdogs_logger = logging.getLogger('watchdogs') 47 | uniparser_logger = logging.getLogger('uniparser') 48 | uvicorn_logger = logging.getLogger('uvicorn') 49 | if not Config.mute_file_log: 50 | info_handler = get_file_handler('info.log') 51 | watchdogs_logger.addHandler(info_handler) 52 | uniparser_logger.addHandler(info_handler) 53 | 54 | error_handler = get_file_handler('error.log') 55 | watchdogs_logger.addHandler(error_handler) 56 | uniparser_logger.addHandler(error_handler) 57 | 58 | server_handler = get_file_handler('server.log') 59 | uvicorn_logger.addHandler(server_handler) 60 | 61 | if not Config.mute_std_log: 62 | handler = get_stream_handler() 63 | watchdogs_logger.addHandler(handler) 64 | uniparser_logger.addHandler(handler) 65 | uvicorn_logger.addHandler(handler) 66 | return watchdogs_logger 67 | 68 | 69 | def setup_models(): 70 | from databases import Database 71 | 72 | # lazy import models to config cache size, means set cache after run main.init_app 73 | from .models import Metas, RuleStorageDB, create_tables 74 | 75 | Config.db = Database(Config.db_url) 76 | Config.rule_db = RuleStorageDB(Config.db) 77 | Config.metas = Metas(Config.db) 78 | # if Config.db_backup_function is None and Config.db_url.startswith( 
79 | # 'sqlite:///'): 80 | # Config.db_backup_function = default_db_backup_sqlite 81 | create_tables(str(Config.db.url)) 82 | 83 | 84 | async def setup_uniparser(): 85 | import base64 86 | import binascii 87 | import datetime 88 | import math 89 | import random 90 | import re 91 | 92 | import uniparser.fastapi_ui 93 | from torequests.utils import ( 94 | curlparse, 95 | escape, 96 | guess_interval, 97 | itertools_chain, 98 | json, 99 | parse_qs, 100 | parse_qsl, 101 | ptime, 102 | quote, 103 | quote_plus, 104 | slice_by_size, 105 | slice_into_pieces, 106 | split_n, 107 | timeago, 108 | ttime, 109 | unescape, 110 | unique, 111 | unquote, 112 | unquote_plus, 113 | urljoin, 114 | urlparse, 115 | urlsplit, 116 | urlunparse, 117 | ) 118 | from uniparser.config import GlobalConfig 119 | from uniparser.parsers import UDFParser 120 | from uniparser.utils import TorequestsAiohttpAsyncAdapter 121 | UDFParser._GLOBALS_ARGS.update({ 122 | 're': re, 123 | 'datetime': datetime, 124 | 'curlparse': curlparse, 125 | 'math': math, 126 | 'random': random, 127 | 'escape': escape, 128 | 'guess_interval': guess_interval, 129 | 'itertools_chain': itertools_chain, 130 | 'json': json, 131 | 'parse_qs': parse_qs, 132 | 'parse_qsl': parse_qsl, 133 | 'ptime': ptime, 134 | 'quote': quote, 135 | 'quote_plus': quote_plus, 136 | 'slice_by_size': slice_by_size, 137 | 'slice_into_pieces': slice_into_pieces, 138 | 'split_n': split_n, 139 | 'timeago': timeago, 140 | 'ttime': ttime, 141 | 'unescape': unescape, 142 | 'unique': unique, 143 | 'unquote': unquote, 144 | 'unquote_plus': unquote_plus, 145 | 'urljoin': urljoin, 146 | 'urlparse': urlparse, 147 | 'urlsplit': urlsplit, 148 | 'urlunparse': urlunparse, 149 | 'base64': base64, 150 | 'binascii': binascii, 151 | }) 152 | GlobalConfig.GLOBAL_TIMEOUT = Config.downloader_timeout 153 | Uniparser._DEFAULT_ASYNC_FREQUENCY = AsyncFrequency( 154 | *Config.DEFAULT_HOST_FREQUENCY) 155 | await load_host_freqs() 156 | Config.uniparser = Uniparser( 157 | request_adapter=TorequestsAiohttpAsyncAdapter()) 158 | uniparser.fastapi_ui.views.uni = Config.uniparser 159 | 160 | 161 | def setup_cdn_urls(use_default_cdn=False): 162 | from uniparser.fastapi_ui.views import cdn_urls 163 | 164 | if not Config.cdn_urls: 165 | # while cdn_urls not set, check use default cdn or static files. 
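        # For example (a sketch; the five keys must match those used below), a
        # deployer could pin custom mirrors before init_app runs:
        #   Config.cdn_urls = {'VUE_JS_CDN': 'https://my.mirror/vue.min.js', ...}
        # in which case this whole fallback branch is skipped.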
166 | if use_default_cdn: 167 | # default online cdn 168 | Config.cdn_urls = { 169 | 'VUE_JS_CDN': 'https://cdn.staticfile.org/vue/2.6.11/vue.min.js', 170 | 'ELEMENT_CSS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/theme-chalk/index.css', 171 | 'ELEMENT_JS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/index.js', 172 | 'VUE_RESOURCE_CDN': 'https://cdn.staticfile.org/vue-resource/1.5.1/vue-resource.min.js', 173 | 'CLIPBOARDJS_CDN': 'https://cdn.staticfile.org/clipboard.js/2.0.4/clipboard.min.js', 174 | } 175 | else: 176 | # local statics 177 | Config.cdn_urls = { 178 | 'VUE_JS_CDN': '/static/js/vue.min.js', 179 | 'ELEMENT_CSS_CDN': '/static/css/index.css', 180 | 'ELEMENT_JS_CDN': '/static/js/index.js', 181 | 'VUE_RESOURCE_CDN': '/static/js/vue-resource.min.js', 182 | 'CLIPBOARDJS_CDN': '/static/js/clipboard.min.js', 183 | } 184 | # overwrite uniparser's cdn 185 | cdn_urls.update(Config.cdn_urls) 186 | 187 | 188 | def setup_lru_cache(): 189 | Config._md5 = lru_cache(maxsize=Config.md5_cache_maxsize)(Config._md5) 190 | Config.get_sign = lru_cache(maxsize=Config.sign_cache_maxsize)( 191 | Config.get_sign) 192 | 193 | 194 | def setup(use_default_cdn=False): 195 | setup_logger() 196 | setup_lru_cache() 197 | setup_cdn_urls(use_default_cdn=use_default_cdn) 198 | setup_models() 199 | 200 | 201 | async def setup_md5_salt(): 202 | logger = Config.logger 203 | exist_salt = await Config.metas.get('md5_salt', None) 204 | if not Config.md5_salt: 205 | if exist_salt: 206 | # no need to update 207 | Config.md5_salt = exist_salt 208 | return 209 | else: 210 | # create new salt 211 | from uuid import uuid1 212 | Config.md5_salt = uuid1().hex 213 | elif Config.md5_salt == exist_salt: 214 | # no need to update 215 | return 216 | # need to update: new md5_salt from settings, or no exist_salt 217 | logger.critical(f'Setting md5_salt as {Config.md5_salt}, replaced into db.') 218 | return await Config.metas.set('md5_salt', Config.md5_salt) 219 | 220 | 221 | async def setup_crawler(): 222 | from uniparser import Crawler 223 | 224 | from .callbacks import CallbackHandler 225 | 226 | crawler = Crawler(uniparser=Config.uniparser, storage=Config.rule_db) 227 | Config.crawler = crawler 228 | if Config.callback_handler is None: 229 | Config.callback_handler = CallbackHandler() 230 | workers = ', '.join(Config.callback_handler.callbacks_dict.keys()) 231 | Config.logger.info(f'Current online callbacks: {workers}') 232 | 233 | 234 | async def update_password(password=None): 235 | if password is not None: 236 | Config.password = password 237 | return await Config.metas.set('admin', Config.password) 238 | 239 | 240 | async def refresh_token(): 241 | if Config.password: 242 | await update_password() 243 | password = Config.password 244 | else: 245 | password = await Config.metas.get('admin', '') 246 | if password: 247 | Config.watchdog_auth = md5(password) 248 | 249 | 250 | async def setup_background(): 251 | from .background import background_loop, db_backup_handler 252 | from .crawler import crawl_once 253 | Config.background_funcs.append(crawl_once) 254 | if Config.db_backup_function: 255 | Config.background_funcs.append(db_backup_handler) 256 | Config.background_task = ensure_future( 257 | background_loop(Config.background_funcs)) 258 | 259 | 260 | def setup_exception_handlers(app): 261 | for exc, callback in Config.exception_handlers: 262 | app.add_exception_handler(exc, callback) 263 | 264 | 265 | def mute_noise_logger(): 266 | # uvicorn will set new handler for root logger and access logger after app 
launched. 267 | logging.getLogger('').handlers.clear() 268 | if Config.uvicorn_kwargs['access_log']: 269 | # fix https://github.com/encode/uvicorn/issues/523 270 | access_logger = logging.getLogger('uvicorn.access') 271 | access_logger.propagate = True 272 | access_logger.handlers.clear() 273 | 274 | 275 | async def setup_app(app): 276 | mute_noise_logger() 277 | db = Config.db 278 | if not db: 279 | raise RuntimeError('No database?') 280 | await db.connect() 281 | await setup_md5_salt() 282 | # refresh_token should be after setup_md5_salt 283 | await refresh_token() 284 | setup_exception_handlers(app) 285 | # 1 286 | await setup_uniparser() 287 | # 2 288 | await setup_crawler() 289 | # 3 290 | await setup_background() 291 | from . import __version__ 292 | Config.logger.info( 293 | f'App started, the current version is {__version__}, CONFIG_DIR: {Config.CONFIG_DIR}' 294 | ) 295 | 296 | 297 | async def release_app(app): 298 | Config.is_shutdown = True 299 | if Config.background_task and not Config.background_task.done(): 300 | Config.background_task.cancel() 301 | if Config.db: 302 | await Config.db.disconnect() 303 | 304 | 305 | async def default_db_backup_sqlite(): 306 | current_time = datetime.now().strftime('%Y%m%d%H%M%S') 307 | for storage_path in Config.CONFIG_DIR.iterdir(): 308 | if storage_path.name == 'storage.sqlite': 309 | import shutil 310 | from pathlib import Path 311 | backup_dir: Path = ensure_dir(Config.CONFIG_DIR / 'backups') 312 | backup_path = backup_dir / f'storage-{current_time}.sqlite' 313 | # 3.6 has no get_running_loop 314 | loop = get_event_loop() 315 | # wait for copy 316 | future = loop.run_in_executor(None, shutil.copy, str(storage_path), 317 | str(backup_path)) 318 | await future 319 | # remove overdue files 320 | backup_file_paths = sorted([i for i in backup_dir.iterdir()], 321 | key=lambda path: path.name, 322 | reverse=True) 323 | path_to_del = backup_file_paths[Config.db_backup_count:] 324 | for p in path_to_del: 325 | p.unlink() 326 | 327 | 328 | def get_host_freq_list(host): 329 | freq = Uniparser._HOST_FREQUENCIES.get(host, None) 330 | if freq: 331 | return [freq.n, freq.interval] 332 | else: 333 | return [None, 0] 334 | 335 | 336 | async def set_host_freq(host, n, interval): 337 | if n: 338 | Uniparser._HOST_FREQUENCIES[host] = AsyncFrequency(n, interval) 339 | else: 340 | Uniparser._HOST_FREQUENCIES.pop(host, None) 341 | await save_host_freqs() 342 | 343 | 344 | async def save_host_freqs(): 345 | items = { 346 | host: freq.to_list() 347 | for host, freq in Uniparser._HOST_FREQUENCIES.items() 348 | } 349 | await Config.metas.set('host_freqs', dumps(items)) 350 | 351 | 352 | async def load_host_freqs(): 353 | host_freqs_str = await Config.metas.get('host_freqs', default='{}') 354 | host_freqs = loads(host_freqs_str) 355 | Uniparser._HOST_FREQUENCIES = { 356 | host: AsyncFrequency(*args) for host, args in host_freqs.items() 357 | } 358 | -------------------------------------------------------------------------------- /watchdogs/static/js/vue-resource.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * vue-resource v1.5.1 3 | * https://github.com/pagekit/vue-resource 4 | * Released under the MIT License. 
5 | */
6 | 
7 | !function(t,e){"object"==typeof exports&&"undefined"!=typeof module?module.exports=e():"function"==typeof define&&define.amd?define(e):t.VueResource=e()}(this,function(){"use strict";function u(t){this.state=2,this.value=void 0,this.deferred=[];var e=this;try{t(function(t){e.resolve(t)},function(t){e.reject(t)})}catch(t){e.reject(t)}}u.reject=function(n){return new u(function(t,e){e(n)})},u.resolve=function(n){return new u(function(t,e){t(n)})},u.all=function(s){return new u(function(n,t){var o=0,r=[];function e(e){return function(t){r[e]=t,(o+=1)===s.length&&n(r)}}0===s.length&&n(r);for(var i=0;i
--------------------------------------------------------------------------------
/watchdogs/crawler.py:
--------------------------------------------------------------------------------
47 | def find_next_check_time(task: Task, now: Optional[datetime] = None) -> Tuple[bool, datetime]:
48 |     '''
49 |     Five supported formats:
50 | 
51 |     1. Two numbers separated by ', ', as work_hours:
52 |         0, 24 means from 00:00 ~ 23:59, for every day
53 |     2. JSON list of int, as work_hours:
54 |         [1, 19] means 01:00~01:59 a.m. and 07:00~07:59 p.m., for every day
55 |     3. Standard strftime format, as work_days:
56 |         > Split work_hours by '==', then check
57 |           whether datetime.now().strftime(wh[0]) == wh[1]
58 |         %A==Friday means every Friday
59 |         %m-%d==03-13 means every year on 03-13
60 |         %H==05 means every morning 05:00 ~ 05:59
61 |     4. Mix work_days and work_hours:
62 |         > Join work_days and work_hours with ';'/'&' => 'and', '|' => 'or'.
63 |         > Supports == for equal, != for unequal.
64 |         %w==5;20, 24 means every Friday 20:00 ~ 23:59
65 |         [1, 2, 15];%w==5 means every Friday at 1 a.m., 2 a.m. and 3 p.m. (the work_hours part is on the left side)
66 |         %w==5|20, 24 means every Friday, or any day 20:00 ~ 23:59
67 |         %w==5|%w==2 means every Friday or Tuesday
68 |         %w!=6&%w!=0 means every day except Saturday & Sunday
69 |     5. Set an ensured change interval:
70 |         > If the work_hours string ends with `#` plus `x` seconds, next_change_time is checked first.
71 |         > In other words: the interval between two changes is known to be more than `x` seconds,
72 |         > so this task's crawler will not run again before `last_change_time + change_interval`.
73 |         %w==5#86400 means every Friday, if it did not change within 1 day
74 |         0, 24#3600 means every hour, if it did not change within this hour. After a change, the task is crawled at most once per interval.
75 |     '''
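    # Worked examples for the formats above (illustrative; they assume that
    # check_work_time from .utils evaluates work_hours exactly as documented):
    #   check_work_time('0, 24', datetime(2020, 3, 13, 5))        -> True   (always on)
    #   check_work_time('[1, 19]', datetime(2020, 3, 13, 5))      -> False  (only 1 a.m. and 7 p.m.)
    #   check_work_time('%w==5', datetime(2020, 3, 13, 5))        -> True   (2020-03-13 is a Friday)
    #   check_work_time('%w==5;20, 24', datetime(2020, 3, 13, 5)) -> False  (Friday, but not 20:00~23:59)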
76 |     # find the latest hour that fits work_hours; if none exists, check from the next day 00:00
77 |     now = now or datetime.now()
78 |     work_hours = task.work_hours or '0, 24'
79 |     if '#' in work_hours:
80 |         # check whether the result changed recently
81 |         last_change_time = task.last_change_time or datetime.fromtimestamp(0)
82 |         # split work_hours and change_interval
83 |         work_hours, change_interval_str = work_hours.split('#')
84 |         change_interval = int(change_interval_str)
85 |         # still inside the change interval: wait for the remaining seconds
86 |         next_change_time = last_change_time + timedelta(seconds=change_interval)
87 |         if now < next_change_time:
88 |             Config.logger.info(
89 |                 f'Task [{task.name}] changed less than {timeago(change_interval, accuracy=1, format=1, short_name=1)} ago.'
90 |             )
91 |             return False, next_change_time
92 | 
93 |     need_crawl = check_work_time(work_hours, now)
94 |     if need_crawl:
95 |         # inside work time: next_check_time is now + interval
96 |         next_check_time = now + timedelta(seconds=task.interval)
97 |         return need_crawl, next_check_time
98 |     else:
99 |         # outside work time
100 |         next_check_time = now
101 |         # fast-forward next_check_time instead of re-checking once per crawl loop
102 |         for _ in range(60):
103 |             # next interval
104 |             next_check_time = next_check_time + timedelta(seconds=task.interval)
105 |             _need_crawl = check_work_time(work_hours, next_check_time)
106 |             if _need_crawl:
107 |                 # `now` is still off-hours, but next_check_time lands inside work time
108 |                 break
109 |         return need_crawl, next_check_time
110 | 
111 | 
112 | async def crawl(task: Task):
113 |     crawler: Crawler = Config.crawler
114 |     logger = Config.logger
115 |     logger.info(f'Start crawling: {task.name}')
116 |     crawl_result = await try_catch(crawler.acrawl, task.request_args)
117 |     error = ''
118 |     if isinstance(crawl_result, RuleNotFoundError):
119 |         error = repr(crawl_result)
120 |         logger.error(f'{task.name}: {error}')
121 |         result_list = [{"text": error}]
122 |     elif isinstance(crawl_result, BaseException):
123 |         error = getattr(crawl_result, 'text', repr(crawl_result))
124 |         logger.error(f'{task.name}: {error}')
125 |         result_list = None
126 |     else:
127 |         if len(crawl_result) == 1:
128 |             # crawl_result schema: {rule_name: list_or_dict}
129 |             formatted_result = get_watchdog_result(
130 |                 item=crawl_result.popitem()[1])
131 |             if formatted_result == {'text': 'text not found'}:
132 |                 error = f'{task.name} text not found, crawl result given: {crawl_result}'
133 |                 logger.error(error)
134 |                 result_list = None
135 |             else:
136 |                 if isinstance(formatted_result, list):
137 |                     result_list = formatted_result
138 |                 else:
139 |                     result_list = [formatted_result]
140 |                 # use Force Crawl on the web UI for more logs
141 |                 logger.info(f'{task.name} Crawl success: {result_list}'[:150])
142 |         else:
143 |             error = 'Invalid crawl_result against schema {rule_name: [{"text": "Required", "url": "Optional", "key": "Optional", "unique": "Optional"}]}, given is %r' % crawl_result
144 |             logger.error(f'{task.name}: {error}')
145 |             result_list = [{"text": error}]
146 |     return task, error, result_list
147 | 
148 | 
149 | async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
150 |     """a non-null task_name means a forced crawl of that single task"""
151 |     db: Database = Config.db
152 |     now = datetime.now()
153 |     logger = Config.logger
154 |     logger.info(f'crawl_once task_name={task_name} start.')
155 |     # sqlite has no datediff, so compare a precomputed next_check_time instead
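    # For reference, the enabled-tasks branch below compiles to roughly this SQL
    # (a sketch; the exact text depends on the SQLAlchemy dialect):
    #   SELECT * FROM tasks WHERE enable = 1 AND next_check_time <= :now LIMIT 20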
156 |     if task_name:
157 |         query = tasks.select().where(tasks.c.name == task_name)
158 |     else:
159 |         query = tasks.select().where(tasks.c.enable == 1).where(
160 |             tasks.c.next_check_time <= now)
161 |     query = query.limit(chunk_size)
162 |     todo = []
163 |     now = datetime.now()
164 |     update_values = []
165 |     CLEAR_CACHE_NEEDED = False
166 |     fetched_tasks = await db.fetch_all(query=query)
167 |     has_more = len(fetched_tasks) >= chunk_size
168 |     for _task in fetched_tasks:
169 |         task = Task(**dict(_task))
170 |         # check work hours
171 |         need_crawl, next_check_time = find_next_check_time(task, now)
172 |         if task_name:
173 |             # always crawl for a given task_name
174 |             need_crawl = True
175 |         if need_crawl:
176 |             t = ensure_future(crawl(task))
177 |             # attach the name (for logs) and id (for timeout errors below)
178 |             t.task_name, t.task_id = task.name, task.task_id
179 |             todo.append(t)
180 |         # update next_check_time
181 |         values = {
182 |             'last_check_time': now,
183 |             'next_check_time': next_check_time,
184 |             'task_id': task.task_id
185 |         }
186 |         # update the task variable for callbacks
187 |         task.__dict__.update(values)
188 |         update_values.append(values)
189 |         if not need_crawl:
190 |             logger.info(
191 |                 f'Task [{task.name}] is not on work time, next_check_time reset to {next_check_time}'
192 |             )
193 |     update_query = 'update tasks set `last_check_time`=:last_check_time,`next_check_time`=:next_check_time where task_id=:task_id'
194 |     await db.execute_many(query=update_query, values=update_values)
195 |     if update_values:
196 |         CLEAR_CACHE_NEEDED = True
197 |     logger.info(f'crawl_once crawling {len(todo)} valid tasks.')
198 |     if todo:
199 |         crawl_errors = []
200 |         done, pending = await wait(todo, timeout=Config.default_crawler_timeout)
201 |         if pending:
202 |             names = [getattr(t, 'task_name', None) for t in pending]
203 |             logger.error(f'crawl timeout {len(names)}: {names}')
204 |             for _pending in pending:
205 |                 crawl_errors.append({
206 |                     'task_id': _pending.task_id,  # attached above; a bare `task` here would be a stale loop variable
207 |                     'error': 'timeout(%s)' % Config.default_crawler_timeout
208 |                 })
209 |                 _pending.cancel()
210 |         ttime_now = ttime()
211 |         changed_tasks = []
212 |         update_counts = 0
213 |         for t in done:
214 |             task, error, result_list = t.result()
215 |             if error != task.error:
216 |                 crawl_errors.append({'task_id': task.task_id, 'error': error})
217 |             if error or result_list is None:
218 |                 # skip updating this task
219 |                 continue
220 |             # compare latest_result with the new list
221 |             # newest first, matching the saved result_list ordering
222 |             old_latest_result = loads(task.latest_result)
223 |             # try to use the explicit key, else the result itself
224 |             try:
225 |                 old_result_list = loads(
226 |                     task.result_list) if task.result_list else []
227 |             except JSONDecodeError:
228 |                 old_result_list = []
229 |             if old_latest_result.get('unique', True):
230 |                 # unique mode skips all duplicated results
231 |                 exist_keys = {
232 |                     get_result_key(_old_result['result'])
233 |                     for _old_result in old_result_list
234 |                 }
235 |             else:
236 |                 old_latest_result_key = get_result_key(old_latest_result)
237 |                 exist_keys = {old_latest_result_key}
238 |             # list of dict
239 |             to_insert_result_list = []
240 |             for result in result_list:
241 |                 result_key = get_result_key(result)
242 |                 if result_key in exist_keys:
243 |                     break
244 |                 to_insert_result_list.append(result)
245 |             if to_insert_result_list:
246 |                 # update db
247 |                 update_counts += 1
248 |                 # new result found
249 |                 query = UpdateTaskQuery(task.task_id)
250 |                 # JSON
251 |                 new_latest_result = dumps(to_insert_result_list[0],
252 |                                           sort_keys=True)
253 |                 query.add('latest_result', new_latest_result)
254 |
query.add('last_change_time', now) 255 | # older insert first, keep the newer is on the top 256 | new_seeds = [] 257 | for result in to_insert_result_list[::-1]: 258 | # result is dict, not json string 259 | old_result_list.insert(0, { 260 | 'result': result, 261 | 'time': ttime_now 262 | }) 263 | new_seeds.append(result) 264 | await save_feed(new_seeds, db, task) 265 | new_result_list = dumps(old_result_list[:task.max_result_count]) 266 | query.add('result_list', new_result_list) 267 | logger.info(f'[Updated] {task.name}. +++') 268 | await db.execute(**query.kwargs) 269 | task.latest_result = new_latest_result 270 | task.last_change_time = now 271 | task.result_list = new_result_list 272 | changed_tasks.append(task) 273 | if crawl_errors: 274 | update_query = 'update tasks set `error`=:error where task_id=:task_id' 275 | await db.execute_many(query=update_query, values=crawl_errors) 276 | logger.info( 277 | f'Crawl task_name={task_name} finished. Crawled: {len(done)}, Error: {len(crawl_errors)}, Timeout: {len(pending)}, Update: {update_counts}.{" +++" if update_counts else ""}' 278 | ) 279 | for task in changed_tasks: 280 | ensure_future(try_catch(Config.callback_handler.callback, task)) 281 | query_feeds.cache_clear() 282 | else: 283 | logger.info(f'Crawl task_name={task_name} finished. 0 todo.') 284 | if CLEAR_CACHE_NEEDED: 285 | logger.info('Clear cache for crawling new tasks.') 286 | query_tasks.cache_clear() 287 | if task_name: 288 | query = tasks.select().where(tasks.c.name == task_name) 289 | _task = await db.fetch_one(query=query) 290 | return dict(_task) 291 | else: 292 | return has_more 293 | 294 | 295 | async def crawl_once(task_name: Optional[str] = None): 296 | if task_name is not None: 297 | return await _crawl_once(task_name) 298 | with solo: 299 | result = await try_catch(_crawl_once, task_name) 300 | return result 301 | 302 | 303 | async def save_feed(new_seeds, db, task): 304 | if not new_seeds: 305 | return 306 | try: 307 | values = [] 308 | for result in new_seeds: 309 | value = { 310 | 'task_id': task.task_id, 311 | 'name': task.name, 312 | 'text': result.get('title') or result.get('text') or '', 313 | 'url': result.get('url') or task.origin_url, 314 | 'ts_create': datetime.now(), 315 | } 316 | values.append(value) 317 | 318 | query = "INSERT INTO feeds (`task_id`, `name`, `text`, `url`, `ts_create`) values (:task_id, :name, :text, :url, :ts_create)" 319 | result = await db.execute_many(query=query, values=values) 320 | Config.logger.info( 321 | f'Insert task seeds success({task.name}): ({len(values)})') 322 | return result 323 | except Exception: 324 | Config.logger.error( 325 | f'Inserting task seeds failed({task.name}): {format_exc()}') 326 | -------------------------------------------------------------------------------- /watchdogs/models.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from traceback import format_exc 4 | from typing import Iterable, List, Optional, Set, Tuple, Union 5 | 6 | import sqlalchemy 7 | from async_lru import alru_cache 8 | from databases import Database 9 | from pydantic import BaseModel 10 | from sqlalchemy.sql import text 11 | from uniparser import CrawlerRule, HostRule 12 | from uniparser.crawler import RuleStorage, get_host 13 | 14 | from .config import Config 15 | 16 | if Config.COLLATION is None: 17 | if Config.db_url.startswith('sqlite'): 18 | Config.COLLATION = None 19 | else: 20 | Config.COLLATION = 'utf8_unicode_ci' 21 | 22 | metadata = 
sqlalchemy.MetaData() 23 | date0 = datetime.strptime('1970-01-01 08:00:00', '%Y-%m-%d %H:%M:%S') 24 | # server_default works instead of default, issue: https://github.com/encode/databases/issues/72 25 | tasks = sqlalchemy.Table( 26 | "tasks", 27 | metadata, 28 | sqlalchemy.Column('task_id', 29 | sqlalchemy.Integer, 30 | primary_key=True, 31 | autoincrement=True), 32 | sqlalchemy.Column("name", 33 | sqlalchemy.String(64, collation=Config.COLLATION), 34 | nullable=False, 35 | index=True, 36 | unique=True), 37 | sqlalchemy.Column("enable", 38 | sqlalchemy.Integer, 39 | server_default=text('1'), 40 | nullable=False), 41 | sqlalchemy.Column("tag", 42 | sqlalchemy.String(128, collation=Config.COLLATION), 43 | server_default="default", 44 | nullable=False), 45 | sqlalchemy.Column("error", sqlalchemy.TEXT(collation=Config.COLLATION)), 46 | sqlalchemy.Column("request_args", 47 | sqlalchemy.TEXT(collation=Config.COLLATION), 48 | nullable=False), 49 | sqlalchemy.Column("origin_url", 50 | sqlalchemy.String(1024), 51 | nullable=False, 52 | server_default=""), 53 | sqlalchemy.Column("interval", 54 | sqlalchemy.Integer, 55 | server_default=text('300'), 56 | nullable=False), 57 | sqlalchemy.Column("work_hours", 58 | sqlalchemy.String(32), 59 | server_default='0, 24', 60 | nullable=False), 61 | sqlalchemy.Column("max_result_count", 62 | sqlalchemy.Integer, 63 | server_default=text('10'), 64 | nullable=False), 65 | sqlalchemy.Column("latest_result", sqlalchemy.TEXT), 66 | sqlalchemy.Column("result_list", sqlalchemy.TEXT), # JSON list 67 | sqlalchemy.Column("last_check_time", 68 | sqlalchemy.TIMESTAMP, 69 | server_default="1970-01-01 08:00:00", 70 | nullable=False), 71 | sqlalchemy.Column("next_check_time", 72 | sqlalchemy.TIMESTAMP, 73 | server_default="1970-01-01 08:00:00", 74 | nullable=False), 75 | sqlalchemy.Column("last_change_time", 76 | sqlalchemy.TIMESTAMP, 77 | server_default="1970-01-01 08:00:00", 78 | index=True, 79 | nullable=False), 80 | sqlalchemy.Column("custom_info", 81 | sqlalchemy.TEXT(collation=Config.COLLATION)), 82 | ) 83 | host_rules = sqlalchemy.Table( 84 | "host_rules", 85 | metadata, 86 | sqlalchemy.Column('host', sqlalchemy.String(128), primary_key=True), 87 | sqlalchemy.Column('host_rule', sqlalchemy.TEXT), 88 | ) 89 | metas = sqlalchemy.Table( 90 | "metas", 91 | metadata, 92 | sqlalchemy.Column('key', 93 | sqlalchemy.String(64, collation=Config.COLLATION), 94 | primary_key=True), 95 | sqlalchemy.Column('value', sqlalchemy.TEXT(collation=Config.COLLATION)), 96 | ) 97 | feeds = sqlalchemy.Table( 98 | "feeds", 99 | metadata, 100 | sqlalchemy.Column('id', 101 | sqlalchemy.Integer, 102 | primary_key=True, 103 | autoincrement=True), 104 | sqlalchemy.Column('task_id', sqlalchemy.Integer, nullable=False), 105 | sqlalchemy.Column("name", 106 | sqlalchemy.String(64, collation=Config.COLLATION), 107 | nullable=False), 108 | # sqlalchemy.Column("tag", 109 | # sqlalchemy.String(128, collation=Config.COLLATION), 110 | # server_default="default", 111 | # nullable=False), 112 | sqlalchemy.Column("text", sqlalchemy.TEXT), 113 | sqlalchemy.Column("url", 114 | sqlalchemy.String(1024), 115 | nullable=False, 116 | server_default=""), 117 | sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False), 118 | ) 119 | groups = sqlalchemy.Table( 120 | "groups", 121 | metadata, 122 | sqlalchemy.Column('id', 123 | sqlalchemy.Integer, 124 | primary_key=True, 125 | autoincrement=True), 126 | sqlalchemy.Column("name", 127 | sqlalchemy.String(64, collation=Config.COLLATION), 128 | nullable=False), 129 | 
sqlalchemy.Column("task_ids", sqlalchemy.TEXT), 130 | # sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False), 131 | ) 132 | 133 | 134 | def create_tables(db_url): 135 | try: 136 | engine = sqlalchemy.create_engine(db_url) 137 | metadata.create_all(engine) 138 | except BaseException: 139 | Config.logger.critical(f'Fatal error on creating Table: {format_exc()}') 140 | import os 141 | os._exit(1) 142 | 143 | 144 | class RuleStorageDB(RuleStorage): 145 | 146 | def __init__(self, db): 147 | self.db = db 148 | self.logger = Config.logger 149 | 150 | async def commit(self): 151 | pass 152 | 153 | async def get_host_rule(self, host: str, default=None): 154 | query = "SELECT host_rule FROM host_rules WHERE host = :host" 155 | host_rule = await self.db.fetch_one(query=query, values={"host": host}) 156 | if host_rule: 157 | return HostRule.loads(host_rule[0]) 158 | else: 159 | return default 160 | 161 | async def find_crawler_rule(self, url, method='find') -> CrawlerRule: 162 | if not url: 163 | return None 164 | host = get_host(url) 165 | host_rule = await self.get_host_rule(host) 166 | if host_rule: 167 | return host_rule.find(url) 168 | 169 | async def add_crawler_rule(self, rule: CrawlerRule, commit=None): 170 | if isinstance(rule, str): 171 | rule = CrawlerRule.loads(rule) 172 | elif isinstance(rule, dict) and not isinstance(rule, CrawlerRule): 173 | rule = CrawlerRule(**rule) 174 | if not rule.get('regex'): 175 | raise ValueError('regex should not be null') 176 | url = rule.get('request_args', {}).get('url') 177 | if not url: 178 | self.logger.error(f'[Rule] {rule["name"]} not found url.') 179 | return False 180 | host = get_host(url) 181 | if not host: 182 | return False 183 | exist_host_rule = await self.get_host_rule(host) 184 | if exist_host_rule: 185 | exist_host_rule.add_crawler_rule(rule) 186 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host" 187 | return await self.db.execute( 188 | query=query, 189 | values={ 190 | 'host_rule_string': exist_host_rule.dumps(), 191 | 'host': host 192 | }) 193 | else: 194 | host_rule = HostRule(host) 195 | host_rule.add_crawler_rule(rule) 196 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)" 197 | return await self.db.execute( 198 | query=query, 199 | values={ 200 | 'host_rule_string': host_rule.dumps(), 201 | 'host': host 202 | }) 203 | 204 | async def pop_crawler_rule(self, rule: CrawlerRule, commit=False): 205 | query = "SELECT host_rule FROM host_rules" 206 | host = get_host(rule['request_args'].get('url')) 207 | values = {} 208 | if host: 209 | query += ' WHERE host = :host' 210 | values['host'] = host 211 | rows = await self.db.fetch_all(query=query, values=values) 212 | for row in rows: 213 | host_rule = HostRule.loads(row.host_rule) 214 | crawler_rule = host_rule.pop_crawler_rule(rule['name']) 215 | if crawler_rule: 216 | # update host_rule 217 | await self.add_host_rule(host_rule) 218 | return crawler_rule 219 | 220 | async def add_host_rule(self, rule: HostRule, commit=None): 221 | """insert or update HostRule""" 222 | # some sql not support upsert: insert replace, replace into, on conflict 223 | query = "SELECT host_rule FROM host_rules WHERE host = :host" 224 | exist_host_rule = await self.get_host_rule(rule['host']) 225 | if exist_host_rule: 226 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host" 227 | return await self.db.execute(query=query, 228 | values={ 229 | 'host_rule_string': rule.dumps(), 230 | 'host': rule['host'] 231 | }) 
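        # (Sketch) Backends with native upsert could collapse the two branches
        # into one statement, e.g. SQLite / PostgreSQL:
        #   INSERT INTO host_rules (host, host_rule) VALUES (:host, :host_rule_string)
        #   ON CONFLICT(host) DO UPDATE SET host_rule = excluded.host_rule
        # The portable select-then-write here avoids such dialect differences.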
232 | else: 233 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)" 234 | return await self.db.execute(query=query, 235 | values={ 236 | 'host_rule_string': rule.dumps(), 237 | 'host': rule['host'] 238 | }) 239 | 240 | async def pop_host_rule(self, host: str, commit=None): 241 | exist_host_rule = await self.get_host_rule(host) 242 | host_rule = HostRule.loads(exist_host_rule) if exist_host_rule else None 243 | if host_rule: 244 | query = "delete FROM host_rules WHERE host = :host" 245 | await self.db.execute(query=query, values={'host': host}) 246 | return host_rule 247 | 248 | 249 | class Task(BaseModel): 250 | task_id: Optional[int] = None 251 | name: str 252 | enable: int = 0 253 | tag: str = 'default' 254 | error: str = '' 255 | request_args: str 256 | origin_url: str = '' 257 | interval: int = 300 258 | work_hours: str = '0, 24' 259 | max_result_count: int = 30 260 | latest_result: str = '{}' 261 | result_list = '[]' 262 | last_check_time: datetime = date0 263 | next_check_time: datetime = date0 264 | last_change_time: datetime = date0 265 | custom_info: str = '' 266 | 267 | 268 | class Group(BaseModel): 269 | id: Optional[int] = None 270 | name: str = '' 271 | task_ids: str = '' 272 | 273 | 274 | class Feed(BaseModel): 275 | task_id: int 276 | name: str 277 | text: str 278 | url: str 279 | ts_create: datetime 280 | 281 | 282 | class Metas(object): 283 | """Save & Load some variables with db""" 284 | 285 | def __init__(self, db: Database): 286 | self.db = db 287 | 288 | async def set(self, key, value): 289 | query = 'replace into metas (`key`, `value`) values (:key, :value)' 290 | await Config.db.execute(query, values={'key': key, 'value': value}) 291 | self.clear_cache() 292 | if (await self.get(key)) == value: 293 | return True 294 | else: 295 | return False 296 | 297 | async def remove(self, key): 298 | query = 'delete from metas where `key`=:key' 299 | await Config.db.execute(query, values={'key': key}) 300 | self.clear_cache() 301 | if not (await self.get(key)): 302 | return True 303 | else: 304 | return False 305 | 306 | @alru_cache(maxsize=Config.metas_cache_maxsize) 307 | async def _get(self, key, default=None): 308 | query = 'select `value` from metas where `key`=:key' 309 | result = await self.db.fetch_one(query, values={'key': key}) 310 | if result: 311 | return result.value 312 | else: 313 | return default 314 | 315 | async def get(self, key, default=None, cache=True): 316 | if not cache: 317 | self.clear_cache() 318 | return await self._get(key, default=default) 319 | 320 | def clear_cache(self): 321 | self._get.cache_clear() 322 | 323 | 324 | @alru_cache(maxsize=Config.query_tasks_cache_maxsize) 325 | async def query_tasks( 326 | task_name: Optional[str] = None, 327 | task_id: Optional[int] = None, 328 | page: int = 1, 329 | page_size: int = Config.default_page_size, 330 | order_by: str = 'last_change_time', 331 | sort: str = 'desc', 332 | tag: str = '', 333 | task_ids: Tuple[int] = None, 334 | ) -> Tuple[List[dict], bool]: 335 | # task_ids arg type is tuple for cache hashing 336 | offset = page_size * (page - 1) 337 | query = tasks.select() 338 | if task_ids: 339 | query = query.where(tasks.c.task_id.in_(task_ids)) 340 | else: 341 | if task_id is not None: 342 | query = query.where(tasks.c.task_id == task_id) 343 | if task_name is not None: 344 | query = query.where(tasks.c.name == task_name) 345 | if tag: 346 | query = query.where(tasks.c.tag == tag) 347 | if order_by and sort: 348 | ob = getattr(tasks.c, order_by, None) 349 | if ob 
is None:
350 |             raise ValueError(f'bad order_by {order_by}')
351 |         if sort.lower() == 'desc':
352 |             ob = sqlalchemy.desc(ob)
353 |         elif sort.lower() == 'asc':
354 |             ob = sqlalchemy.asc(ob)
355 |         else:
356 |             raise ValueError(
357 |                 f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
358 |         query = query.order_by(ob)
359 |     query = query.limit(page_size + 1).offset(offset)
360 |     _result = await Config.db.fetch_all(query=query)
361 |     has_more = len(_result) > page_size
362 |     result = [dict(i) for i in _result][:page_size]
363 |     query_string = str(query.compile(
364 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
365 |     Config.logger.info(
366 |         f'[Query] {len(result)} tasks (has_more={has_more}): {query_string}')
367 |     return result, has_more
368 | 
369 | 
370 | @alru_cache(maxsize=Config.query_task_ids_cache_maxsize)
371 | async def query_task_ids(task_name: Optional[str] = None,
372 |                          tag: str = '') -> List[int]:
373 |     query = tasks.select()
374 |     if task_name is not None:
375 |         query = query.where(tasks.c.name == task_name)
376 |     if tag:
377 |         query = query.where(tasks.c.tag == tag)
378 |     _result = await Config.db.fetch_all(query=query)
379 |     result = [dict(i)['task_id'] for i in _result]
380 |     query_string = str(query.compile(
381 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
382 |     Config.logger.info(f'[Query] {len(result)} task ids: {query_string}')
383 |     return result
384 | 
385 | 
386 | @alru_cache(maxsize=Config.query_group_task_ids_cache_maxsize)
387 | async def query_group_task_ids(
388 |     group_id: int = None,
389 |     group_ids: Union[str, Tuple[int]] = None,
390 | ) -> List[int]:
391 |     _group_ids: Set[int] = set()
392 |     if group_id:
393 |         _group_ids.add(int(group_id))
394 |     if group_ids:
395 |         if isinstance(group_ids, str):
396 |             for _group_id in re.findall(r'\d+', group_ids):
397 |                 _group_ids.add(int(_group_id))
398 |         elif isinstance(group_ids, tuple):
399 |             _group_ids.update(int(_g) for _g in group_ids)  # add every id in the tuple
400 |     task_ids: Set[int] = set()
401 |     for _group_id in _group_ids:
402 |         query = groups.select()
403 |         query = query.where(groups.c.id == _group_id)
404 |         _result = await Config.db.fetch_one(query=query)
405 |         if _result:
406 |             task_ids_str = dict(_result).get('task_ids') or ''
407 |             for task_id in re.findall(r'\d+', task_ids_str):
408 |                 task_ids.add(int(task_id))
409 |     query_string = str(query.compile(
410 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
411 |     Config.logger.info(
412 |         f'[Query] {len(task_ids)} task_ids by group {group_id or group_ids}: {query_string}'
413 |     )
414 |     return list(task_ids)
415 | 
416 | 
417 | @alru_cache(maxsize=Config.query_feeds_cache_maxsize)
418 | async def query_feeds(
419 |     task_name: Optional[str] = None,
420 |     task_id: Optional[int] = None,
421 |     page: int = 1,
422 |     page_size: int = Config.default_page_size,
423 |     order_by: str = 'id',
424 |     sort: str = 'desc',
425 |     tag: str = '',
426 |     task_ids: Tuple[int] = None,
427 | ) -> Tuple[List[dict], bool]:
428 |     # task_ids arg type is tuple for cache hashing
429 |     offset = page_size * (page - 1)
430 |     query = feeds.select()
431 |     _task_ids: List[int] = []
432 |     if task_ids:
433 |         _task_ids.extend(task_ids)
434 |     if tag:
435 |         _task_ids += await query_task_ids(tag=tag)
436 |     if _task_ids:
437 |         query = query.where(feeds.c.task_id.in_(tuple(_task_ids)))
438 |     else:
439 |         if task_id is not None:
440 |             query = query.where(feeds.c.task_id == task_id)
441 |         if task_name is not None:
442 |             query = query.where(feeds.c.name == task_name)
443 |     if order_by and sort:
444 |         ob = getattr(feeds.c, order_by, None)
445 |         if ob is None:
446 |             raise ValueError(f'bad order_by {order_by}')
447 |         if sort.lower() == 'desc':
448 |             ob = sqlalchemy.desc(ob)
449 |         elif sort.lower() == 'asc':
450 |             ob = sqlalchemy.asc(ob)
451 |         else:
452 |             raise ValueError(
453 |                 f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
454 |         query = query.order_by(ob)
455 |     query = query.limit(page_size + 1).offset(offset)
456 |     _result = await Config.db.fetch_all(query=query)
457 |     has_more = len(_result) > page_size
458 |     result = [dict(i) for i in _result][:page_size]
459 |     query_string = str(query.compile(
460 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
461 |     Config.logger.info(
462 |         f'[Query] {len(result)} feeds (has_more={has_more}): {query_string}')
463 |     return result, has_more
464 | 
465 | 
466 | @alru_cache(maxsize=Config.query_groups_cache_maxsize)
467 | async def query_all_groups() -> List[dict]:
468 |     query = groups.select()
469 |     rows = await Config.db.fetch_all(query=query)
470 |     result = []
471 |     for row in rows:
472 |         result.append(dict(row))
473 |     query_string = str(query.compile(
474 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
475 |     Config.logger.info(f'[Query] {len(result)} groups: {query_string}')
476 |     return result
477 | 
478 | 
479 | async def query_task_errors(tag: str = '',
480 |                             task_ids: Iterable = None) -> List[dict]:
481 |     query = tasks.select().with_only_columns(tasks.c.name, tasks.c.error)
482 |     if task_ids:
483 |         query = query.where(tasks.c.task_id.in_(tuple(task_ids)))
484 |     query = query.where(tasks.c.error != '')
485 |     query = query.where(tasks.c.enable == 1)
486 |     if tag:
487 |         query = query.where(tasks.c.tag == tag)
488 |     query = query.order_by(sqlalchemy.desc('last_change_time'))
489 |     _result: list = await Config.db.fetch_all(query=query)
490 |     result = [dict(task) for task in _result]
491 |     query_string = str(query.compile(
492 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
493 |     Config.logger.info(f'[Query] {len(result)} task errors: {query_string}')
494 |     return result
--------------------------------------------------------------------------------
/watchdogs/templates/index.html:
--------------------------------------------------------------------------------
[Template markup lost in extraction. Recoverable outline: a Vue + Element-UI single-page app titled "Watchdogs v{{version}}", with a task table and forms; a request_args hint ("retry": 2, "timeout": 3, "ssl": false, and other args refer to aiohttp); interval presets (5 mins, 10 mins, 30 mins, 1 hrs, 3 hrs, 6 hrs, 12 hrs, 1 day, 7 days, 30 days); a callbacks picker (${name||'default-callback'}, ${current_cb_doc}); a host-rule panel ("Frequency: Send [n] request each [interval] seconds", Update Frequency, Delete); and crawler-rule fields (Name, Regex, ${rule.name}, ${rule.regex}).]
--------------------------------------------------------------------------------
/watchdogs/app.py:
--------------------------------------------------------------------------------
1 | from base64 import b64encode
2 | from collections import deque
3 | from datetime import datetime
4 | from json import JSONDecodeError, dumps, loads
5 | from pathlib import Path
6 | from typing import Optional
7 | 
8 | import aiofiles
9 | from fastapi import Cookie, FastAPI, Header
10 | from fastapi.staticfiles import StaticFiles
11 | from starlette.requests import Request
12 | from starlette.responses import (
13 |     FileResponse,
14 |     HTMLResponse,
15 |     JSONResponse,
16 |     RedirectResponse,
17 |     Response,
18 | )
19 | from starlette.templating import Jinja2Templates
20 | from torequests.utils import timeago, ttime
21 | from uniparser import CrawlerRule, Uniparser
22 | from uniparser.fastapi_ui import app as sub_app
23 | from uniparser.utils import get_host
24 | 
25 | from . import __version__
26 | from .config import md5_checker
27 | from .crawler import crawl_once, find_next_check_time
28 | from .models import (
29 |     Group,
30 |     Task,
31 |     groups,
32 |     query_all_groups,
33 |     query_feeds,
34 |     query_group_task_ids,
35 |     query_task_errors,
36 |     query_tasks,
37 |     tasks,
38 | )
39 | from .settings import (
40 |     Config,
41 |     get_host_freq_list,
42 |     refresh_token,
43 |     release_app,
44 |     set_host_freq,
45 |     setup_app,
46 | )
47 | from .utils import format_size, gen_rss
48 | 
49 | description = "Watchdogs to keep an eye on the world's change.\nRead more: [https://github.com/ClericPy/watchdogs](https://github.com/ClericPy/watchdogs)"
50 | app = FastAPI(title="Watchdogs", description=description, version=__version__)
51 | 
52 | Config.setup_middleware(app)
53 | sub_app.openapi_prefix = '/uniparser'
54 | app.mount("/uniparser", sub_app)
55 | app.mount("/static",
56 |           StaticFiles(directory=str((Path(__file__).parent /
57 |                                      'static').absolute())),
58 |           name="static")
59 | logger = Config.logger
60 | templates = Jinja2Templates(directory=str((Path(__file__).parent /
61 |                                            'templates').absolute()))
62 | 
63 | 
64 | @app.on_event("startup")
65 | async def startup():
66 |     await setup_app(app)
67 | 
68 | 
69 | @app.on_event("shutdown")
70 | async def shutdown():
71 |     await release_app(app)
72 | 
73 | 
74 | @app.post('/auth')
75 | async def post_auth(request: Request,
76 |                     watchdog_auth: str = Cookie(''),
77 |                     redirect: str = '/'):
78 |     # two scenarios allow setting a new password (otherwise this is a login attempt):
79 |     # 1. watchdog_auth is not set yet; 2. the request is already authenticated
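    # Illustrative login request (hypothetical host/port; shows only the expected
    # JSON body, while the real password and cookie come from your deployment):
    #   curl -X POST 'http://127.0.0.1:9901/auth' -d '{"password": "my-password"}'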
80 |     password = loads(await request.body())['password']
81 |     auth_not_set = not Config.watchdog_auth
82 |     already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
83 |     need_new_pwd = auth_not_set or already_authed
84 |     if password:
85 |         if need_new_pwd:
86 |             old_password = Config.password
87 |             Config.password = password
88 |             await refresh_token()
89 |             resp = JSONResponse({'ok': True, 'redirect': redirect})
90 |             resp.set_cookie('watchdog_auth',
91 |                             Config.watchdog_auth,
92 |                             max_age=Config.cookie_max_age,
93 |                             httponly=True)
94 |             logger.warning(
95 |                 f'password changed {old_password}->{Config.password}.')
96 |             return resp
97 |         elif (await md5_checker(password, Config.watchdog_auth, freq=True)):
98 |             resp = JSONResponse({'ok': True, 'redirect': redirect})
99 |             resp.set_cookie('watchdog_auth',
100 |                             Config.watchdog_auth,
101 |                             max_age=Config.cookie_max_age,
102 |                             httponly=True)
103 |             logger.info('correct password, login success.')
104 |             return resp
105 |     # invalid password, clear cookie
106 |     resp = JSONResponse({'ok': False})
107 |     # resp.set_cookie('watchdog_auth', '')
108 |     resp.delete_cookie('watchdog_auth')
109 |     logger.info(f'invalid password: {password}')
110 |     return resp
111 | 
112 | 
113 | @app.get('/auth')
114 | async def auth(request: Request,
115 |                watchdog_auth: str = Cookie(''),
116 |                redirect: str = '/'):
117 |     auth_not_set = not Config.watchdog_auth
118 |     already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
119 |     need_new_pwd = auth_not_set or already_authed
120 |     context: dict = {'request': request}
121 |     context['version'] = __version__
122 |     if need_new_pwd:
123 |         context['action'] = 'Init'
124 |         context['prompt_title'] = 'Set a new password'
125 |     else:
126 |         context['action'] = 'Login'
127 |         context['prompt_title'] = 'Input the password'
128 |     return templates.TemplateResponse("auth.html", context=context)
129 | 
130 | 
131 | @app.get("/")
132 | async def index(request: Request, tag: str = ''):
133 |     kwargs: dict = {'request': request}
134 |     kwargs['cdn_urls'] = Config.cdn_urls
135 |     kwargs['version'] = __version__
136 |     kwargs['rss_url'] = Config.get_route('/rss', tag=tag)
137 |     kwargs['lite_url'] = Config.get_route('/lite', tag=tag)
138 |     kwargs['feeds_url'] = Config.get_route('/feeds', tag=tag)
139 |     kwargs['rss_feeds_url'] = Config.get_route('/rss_feeds', tag=tag)
140 |     init_vars_json = dumps({
141 |         'custom_links': Config.custom_links,
142 |         'callback_workers': Config.callback_handler.workers,
143 |         'custom_tabs': Config.custom_tabs,
144 |         'work_hours_doc': find_next_check_time.__doc__,
145 |     })
146 |     init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
147 |     kwargs['init_vars'] = init_vars_b64
148 |     return templates.TemplateResponse("index.html", context=kwargs)
149 | 
150 | 
151 | @app.get("/favicon.ico")
152 | async def favicon():
153 |     return RedirectResponse('/static/img/favicon.svg', 301)
154 | 
155 | 
156 | @app.post("/add_new_task")
157 | async def add_new_task(task: Task):
158 |     try:
159 |         if task.interval < 60:
160 |             raise ValueError('interval should not be less than 60 seconds.')
161 |         db = Config.db
162 |         # check whether the task already exists
163 |         if task.task_id is None:
164 |             # insert new task
165 |             query = tasks.insert()
166 |             values = dict(task)
167 |             if not values.get('error'):
168 |                 values['error'] = ''
169 |             # insert with task_id left as None (autoincrement)
170 |             await db.execute(query=query, values=values)
171 |         else:
172 |             # update old task
173 |             query = 'update tasks set
156 | @app.post("/add_new_task")
157 | async def add_new_task(task: Task):
158 |     try:
159 |         if task.interval < 60:
160 |             raise ValueError('interval should not be less than 60 seconds.')
161 |         db = Config.db
162 |         # check whether the task already exists
163 |         if task.task_id is None:
164 |             # insert new task
165 |             query = tasks.insert()
166 |             values = dict(task)
167 |             if not values.get('error'):
168 |                 values['error'] = ''
169 |             # task_id is None, let the database assign it
170 |             await db.execute(query=query, values=values)
171 |         else:
172 |             # update old task
173 |             query = 'update tasks set `name`=:name,`enable`=:enable,`tag`=:tag,`request_args`=:request_args,`origin_url`=:origin_url,`interval`=:interval,`work_hours`=:work_hours,`max_result_count`=:max_result_count,`custom_info`=:custom_info,`next_check_time`=:next_check_time where `task_id`=:task_id'
174 |             values = {
175 |                 'task_id': task.task_id,
176 |                 'name': task.name,
177 |                 'enable': task.enable,
178 |                 'tag': task.tag,
179 |                 'request_args': task.request_args,
180 |                 'origin_url': task.origin_url,
181 |                 'interval': task.interval,
182 |                 'work_hours': task.work_hours,
183 |                 'max_result_count': task.max_result_count,
184 |                 'custom_info': task.custom_info,
185 |                 'next_check_time': datetime.now(),
186 |             }
187 |             await db.execute(query=query, values=values)
188 |         result = {'msg': 'ok'}
189 |         query_tasks.cache_clear()
190 |     except Exception as e:
191 |         result = {'msg': repr(e)}
192 |     logger.info(
193 |         f'{"[Add]" if task.task_id is None else "[Update]"} task {task}: {result}'
194 |     )
195 |     return result
196 | 
197 | 
198 | @app.get("/delete_task")
199 | async def delete_task(task_id: int):
200 |     try:
201 |         query = tasks.delete().where(tasks.c.task_id == task_id)
202 |         await Config.db.execute(query=query)
203 |         result = {'msg': 'ok'}
204 |         query_tasks.cache_clear()
205 |     except Exception as e:
206 |         result = {'msg': repr(e)}
207 |     logger.info(f'[Delete] task {task_id}: {result}')
208 |     return result
209 | 
210 | 
211 | @app.get("/force_crawl")
212 | async def force_crawl(task_name: str):
213 |     try:
214 |         task = await crawl_once(task_name=task_name)
215 |         task['timeago'] = timeago(
216 |             (datetime.now() - task['last_change_time']).total_seconds(),
217 |             1,
218 |             1,
219 |             short_name=True)
220 |         result = {'msg': 'ok', 'task': task}
221 |     except Exception as e:
222 |         result = {'msg': repr(e)}
223 |     logger.info(f'[Force] crawl {task_name}: {result}')
224 |     return result
225 | 
226 | 
227 | @app.get("/load_tasks")
228 | async def load_tasks(
229 |         task_name: Optional[str] = None,
230 |         page: int = 1,
231 |         page_size: int = Config.default_page_size,
232 |         order_by: str = 'last_change_time',
233 |         sort: str = 'desc',
234 |         tag: str = '',
235 | ):
236 |     try:
237 |         _result, has_more = await query_tasks(
238 |             task_name=task_name,
239 |             page=page,
240 |             page_size=page_size,
241 |             order_by=order_by,
242 |             sort=sort,
243 |             tag=tag,
244 |         )
245 |         _result = [task for task in _result]
246 |         now = datetime.now()
247 |         for item in _result:
248 |             item['timeago'] = timeago(
249 |                 (now - item['last_change_time']).total_seconds(),
250 |                 1,
251 |                 1,
252 |                 short_name=True)
253 |         result = {'msg': 'ok', 'tasks': _result, 'has_more': has_more}
254 |     except Exception as e:
255 |         result = {'msg': str(e), 'tasks': [], 'has_more': False}
256 |     return result
257 | 
258 | 
259 | @app.get("/enable_task")
260 | async def enable_task(task_id: int, enable: int = 1):
261 |     query = 'update tasks set `enable`=:enable where `task_id`=:task_id'
262 |     values = {'task_id': task_id, 'enable': enable}
263 |     try:
264 |         _result = await Config.db.execute(query, values)
265 |         result = {'msg': 'ok', 'updated': _result}
266 |         query_tasks.cache_clear()
267 |     except Exception as e:
268 |         result = {'msg': repr(e)}
269 |     return result
270 | 
271 | 
272 | @app.get('/load_hosts')
273 | async def load_hosts(host: str = ''):
274 |     host = get_host(host) or host
275 |     query = 'select `host` from host_rules'
276 |     if host:
277 |         query += ' where `host` like :host'
278 |         values = {'host': f'%{host}%'}
279 |     else:
280 |         values = {}
281 |     query += ' order by `host` asc'
282 |     _result = await Config.db.fetch_all(query, values)
283 |     host_freqs = Uniparser._HOST_FREQUENCIES
284 |     hosts = [{
285 |         'name': getattr(i, 'host', None),
286 |         'freq': getattr(i, 'host', None) in host_freqs
287 |     } for i in _result]
288 |     return {'hosts': hosts, 'host': host}
289 | 
290 | 
291 | @app.get("/get_host_rule")
292 | async def get_host_rule(host: str):
293 |     try:
294 |         if not host:
295 |             raise ValueError('host name should not be null')
296 |         query = 'select `host_rule` from host_rules where `host`=:host'
297 |         values = {'host': host}
298 |         _result = await Config.db.fetch_one(query, values)
299 |         host_rule = getattr(_result, 'host_rule', None)
300 |         host_rule = loads(host_rule) if host_rule else {"host": host}
301 |         host_rule['n'], host_rule['interval'] = get_host_freq_list(host)
302 |         result = {'msg': 'ok', 'host_rule': host_rule}
303 |     except Exception as e:
304 |         result = {'msg': repr(e)}
305 |     logger.info(f'[Get] host_rule {host}: {result}')
306 |     return result
307 | 
308 | 
309 | @app.post("/crawler_rule.{method}")
310 | async def crawler_rule(method: str,
311 |                        rule: CrawlerRule,
312 |                        force: Optional[int] = 0):
313 |     try:
314 |         if not rule['name']:
315 |             raise ValueError('rule name cannot be null')
316 |         if method == 'add':
317 |             if force:
318 |                 exist_rule = await Config.rule_db.find_crawler_rule(
319 |                     rule['request_args']['url'])
320 |                 if exist_rule:
321 |                     logger.info(
322 |                         f'add crawler_rule force=1, old rule removed: {exist_rule}'
323 |                     )
324 |                     await Config.rule_db.pop_crawler_rule(exist_rule)
325 |             _result = await Config.rule_db.add_crawler_rule(rule)
326 |         elif method == 'pop':
327 |             _result = await Config.rule_db.pop_crawler_rule(rule)
328 |         else:
329 |             raise ValueError('method only supports add and pop')
330 |         result = {'msg': 'ok', 'result': _result}
331 |     except Exception as e:
332 |         result = {'msg': repr(e)}
333 |     logger.info(f'[{method.title()}] crawler rule {rule}: {result}')
334 |     return result
335 | 
336 | 
337 | @app.post("/find_crawler_rule")
338 | async def find_crawler_rule(request_args: dict):
339 |     try:
340 |         url = request_args.get('url')
341 |         rule: CrawlerRule = await Config.rule_db.find_crawler_rule(url)
342 |         if not rule:
343 |             raise ValueError(f'rule not found for given url: {url}')
344 |         result = {'msg': 'ok', 'result': rule.dumps()}
345 |     except Exception as e:
346 |         result = {'msg': repr(e)}
347 |     logger.info(f'[Find] crawler rule: {result}')
348 |     return result
349 | 
350 | 
351 | @app.get("/delete_host_rule")
352 | async def delete_host_rule(host: str):
353 |     try:
354 |         if not host:
355 |             raise ValueError('host should not be null')
356 |         await Config.rule_db.pop_host_rule(host)
357 |         result = {'msg': 'ok'}
358 |     except Exception as e:
359 |         result = {'msg': repr(e)}
360 |     logger.info(f'[Delete] host rule {host}: {result}')
361 |     return result
362 | 
363 | 
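# A sketch (not part of app.py) of driving the rule endpoints above from a
# client. The payload layout follows uniparser's CrawlerRule ('name',
# 'request_args', 'parse_rules', 'regex'); the exact schema, the default port
# and the auth cookie value are assumptions here, not guarantees:
#
#     import requests
#
#     rule = {
#         'name': 'demo_rule',   # must not be empty, see the check above
#         'request_args': {'url': 'https://example.com/', 'method': 'get'},
#         'parse_rules': [],
#         'regex': 'https?://example\\.com/.*',
#     }
#     resp = requests.post('http://127.0.0.1:9901/crawler_rule.add',
#                          params={'force': 1},  # force=1 replaces an old rule
#                          json=rule,
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())        # {'msg': 'ok', 'result': ...} on success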
364 | @app.get("/log")
365 | async def log(request: Request,
366 |               max_lines: int = 50,
367 |               refresh_every: int = 0,
368 |               log_names: str = 'info-server-error'):
369 |     window: deque = deque((), max_lines)
370 |     names: list = log_names.split('-')
371 |     items = []
372 |     for name in names:
373 |         file_name = f'{name}.log'
374 |         fp: Path = Config.CONFIG_DIR / file_name
375 |         if not fp.is_file():
376 |             continue
377 |         fp_stat = fp.stat()
378 |         file_size = format_size(fp_stat.st_size)
379 |         st_mtime = ttime(fp_stat.st_mtime)
380 |         line_no = 0
381 |         async with aiofiles.open(fp, encoding=Config.ENCODING) as f:
382 |             async for line in f:
383 |                 line_no += 1
384 |                 window.append(line)
385 |         item = {
386 |             'name': name,
387 |             'line_no': line_no,
388 |             'file_size': file_size,
389 |             'st_mtime': st_mtime,
390 |             'log_text': "".join(window),
391 |             'file_size_mb': Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
392 |                 'file_size_mb', '-1'),
393 |         }
394 |         items.append(item)
395 |         window.clear()
396 |     context = {
397 |         'request': request,
398 |         'items': items,
399 |         'log_names': log_names,
400 |         'refresh_every': refresh_every,
401 |         'max_lines': max_lines,
402 |     }
403 |     return templates.TemplateResponse("logs.html", context=context)
404 | 
405 | 
406 | @app.get("/log.clear")
407 | async def log_clear(log_names: str = 'info-server-error',
408 |                     current_names: str = 'info-server-error'):
409 |     names: list = log_names.split('-')
410 |     for name in names:
411 |         fp: Path = Config.CONFIG_DIR / f'{name}.log'
412 |         if not fp.is_file():
413 |             continue
414 |         # synchronous write: truncating a log file is quick, so briefly blocking the event loop is acceptable
415 |         fp.write_bytes(b'')
416 |         logger.info(f'{name}.log cleared')
417 |     html = f'{log_names} log cleared. Redirecting back.'
418 |     return HTMLResponse(html)
419 | 
420 | 
421 | @app.get("/update_host_freq")
422 | async def update_host_freq(host: str,
423 |                            n: Optional[int] = 0,
424 |                            interval: Optional[int] = 0):
425 |     try:
426 |         if not host:
427 |             raise ValueError('host should not be null')
428 |         await set_host_freq(host, n=n, interval=interval)
429 |         result = {'msg': 'ok'}
430 |     except Exception as e:
431 |         result = {'msg': repr(e)}
432 |     logger.info(f'[Update] host frequency {host}: {result}')
433 |     return result
434 | 
435 | 
436 | @app.get("/rss")
437 | async def rss(
438 |         request: Request,
439 |         tag: str = '',
440 |         sign: str = '',
441 |         host: str = Header('', alias='Host'),
442 |         group_ids: str = '',
443 | ):
444 |     if group_ids:
445 |         task_ids = tuple(await query_group_task_ids(group_ids))
446 |         if not task_ids:
447 |             return JSONResponse(
448 |                 status_code=404,
449 |                 content={
450 |                     "message": 'no tasks found',
451 |                 },
452 |             )
453 |         tasks, _ = await query_tasks(task_ids=task_ids)
454 |     else:
455 |         tasks, _ = await query_tasks(tag=tag)
456 |     source_link = f'https://{host}'
457 |     xml_data: dict = {
458 |         'channel': {
459 |             'title': 'Watchdogs',
460 |             'description': f'Watchdog on web change, v{__version__}.',
461 |             'link': source_link,
462 |         },
463 |         'items': []
464 |     }
465 |     for task in tasks:
466 |         pubDate: str = task['last_change_time'].strftime(
467 |             format='%a, %d %b %Y %H:%M:%S')
468 |         latest_result: dict = loads(task['latest_result'] or '{}')
469 |         if isinstance(latest_result, list):
470 |             logger.error(f'latest_result is list: {latest_result}'); latest_result = {}  # fall back to a dict so .get below cannot raise
471 |         link: str = latest_result.get('url') or task['origin_url']
472 |         description: str = latest_result.get('text') or ''
473 |         title: str = f'{task["name"]}#{latest_result.get("title", description[:Config.TEXT_SLICE_LENGTH])}'
474 |         item: dict = {
475 |             'title': title,
476 |             'link': link,
477 |             'guid': title,
478 |             'description': description,
479 |             'pubDate': pubDate
480 |         }
481 |         xml_data['items'].append(item)
482 |     xml: str = gen_rss(xml_data)
483 |     response = Response(
484 |         content=xml,
485 |         media_type="application/xml",
486 |         headers={'Content-Type': 'application/xml; charset="utf-8"'})
487 |     return response
488 | 
489 | 
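# The `/rss` route above hands gen_rss a plain dict; a minimal sketch of that
# shape (all values invented for illustration), useful when debugging
# utils.gen_rss in isolation:
#
#     xml_data = {
#         'channel': {
#             'title': 'Watchdogs',
#             'description': 'Watchdog on web change, v2.0.1.',
#             'link': 'https://127.0.0.1:9901',
#         },
#         'items': [{
#             'title': 'task_name#latest title',
#             'link': 'https://example.com/latest',
#             'guid': 'task_name#latest title',
#             'description': 'latest text',
#             'pubDate': 'Sat, 01 Jan 2022 00:00:00',
#         }],
#     }
#     # gen_rss(xml_data) returns the serialized feed XML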
490 | @app.post("/lite")
491 | async def post_lite(request: Request, tag: str = '', sign: str = ''):
492 |     task_id = loads(await request.body())['task_id']
493 |     tasks, _ = await query_tasks(task_id=task_id)
494 |     if tasks:
495 |         task = tasks[0]
496 |         try:
497 |             result_list = loads(
498 |                 task['result_list']) if task['result_list'] else []
499 |         except JSONDecodeError:
500 |             result_list = []
501 |         return {'result_list': result_list}
502 |     else:
503 |         return {'result_list': []}
504 | 
505 | 
506 | @app.get("/lite")
507 | async def lite(
508 |         request: Request,
509 |         tag: str = '',
510 |         sign: str = '',
511 |         page: int = 1,
512 |         group_ids: str = '',
513 | ):
514 |     if group_ids:
515 |         task_ids = tuple(await query_group_task_ids(group_ids))
516 |         if not task_ids:
517 |             return JSONResponse(
518 |                 status_code=404,
519 |                 content={
520 |                     "message": 'no tasks found',
521 |                 },
522 |             )
523 |         tasks, has_more = await query_tasks(task_ids=task_ids, page=page)
524 |     else:
525 |         tasks, has_more = await query_tasks(tag=tag, page=page)
526 |     now = datetime.now()
527 |     for task in tasks:
528 |         result = loads(task['latest_result'] or '{}')
529 |         # cache url/text on the task, falling back to the latest result
530 |         task['url'] = task.get('url') or result.get('url') or task['origin_url']
531 |         task['text'] = task.get('text') or result.get('title') or result.get(
532 |             'text') or ''
533 |         task['timeago'] = timeago(
534 |             (now - task['last_change_time']).total_seconds(),
535 |             1,
536 |             1,
537 |             short_name=True)
538 |     context = {'tasks': tasks, 'request': request}
539 |     context['version'] = __version__
540 |     if group_ids:
541 |         params = {'group_ids': group_ids}
542 |     else:
543 |         params = {'tag': tag}
544 |     context['home_url'] = Config.get_route('/lite', **params)
545 |     if has_more:
546 |         if group_ids:
547 |             next_page_url = Config.get_route('/lite', page=page + 1, **params)
548 |         else:
549 |             next_page_url = Config.get_route('/lite', page=page + 1, **params)
550 |     else:
551 |         next_page_url = ''
552 |     context['next_page_url'] = next_page_url
553 |     if page > 1:
554 |         if group_ids:
555 |             last_page_url = Config.get_route('/lite', page=page - 1, **params)
556 |         else:
557 |             last_page_url = Config.get_route('/lite', page=page - 1, **params)
558 |     else:
559 |         last_page_url = ''
560 |     context['last_page_url'] = last_page_url
561 |     context['rss_url'] = Config.get_route('/rss', **params)
562 |     return templates.TemplateResponse("lite.html", context=context)
563 | 
564 | 
565 | @app.get("/feeds")
566 | async def feeds(
567 |         request: Request,
568 |         tag: str = '',
569 |         # user: str = '',
570 |         sign: str = '',
571 |         page: int = 1,
572 |         # page_size: int = Config.default_page_size,
573 |         group_ids: str = '',
574 | ):
575 |     error_tasks = []
576 |     if group_ids:
577 |         task_ids = tuple(await query_group_task_ids(group_ids))
578 |         if not task_ids:
579 |             return JSONResponse(
580 |                 status_code=404,
581 |                 content={
582 |                     "message": 'no tasks found',
583 |                 },
584 |             )
585 |         feeds, has_more = await query_feeds(task_ids=task_ids,
586 |                                             tag=tag,
587 |                                             page=page)
588 |         if page == 1:
589 |             error_tasks.extend(await query_task_errors(tag=tag,
590 |                                                        task_ids=task_ids))
591 |     else:
592 |         feeds, has_more = await query_feeds(tag=tag, page=page)
593 |         if page == 1:
594 |             error_tasks.extend(await query_task_errors(tag=tag))
595 |     now = datetime.now()
596 |     _feeds = []
597 |     current_date = None
598 |     today = datetime.today().strftime('%Y-%m-%d')
599 |     for feed in feeds:
600 |         date = feed['ts_create'].strftime('%Y-%m-%d')
601 |         if date != current_date:
602 |             current_date = date
603 |             if date == today:
604 |                 date += ' [Today]'
605 |             _feeds.append({'current_date': date})
606 |         feed['timeago'] = timeago((now - feed['ts_create']).total_seconds(),
607 |                                   1,
608 |                                   1,
609 |                                   short_name=True)
610 |         _feeds.append(feed)
611 |     context = {'feeds': _feeds, 'request': request, 'error_tasks': error_tasks}
612 |     context['version'] = __version__
613 |     if group_ids:
614 |         params = {'group_ids': group_ids}
615 |     else:
616 |         params = {'tag': tag}
617 |     context['home_url'] = Config.get_route('/feeds', **params)
618 |     if has_more:
619 |         if group_ids:
620 |             next_page_url = Config.get_route('/feeds', page=page + 1, **params)
621 |         else:
622 |             next_page_url = Config.get_route('/feeds', page=page + 1, **params)
623 |     else:
624 |         next_page_url = ''
625 |     context['next_page_url'] = next_page_url
626 |     if page > 1:
627 |         if group_ids:
628 |             last_page_url = Config.get_route('/feeds', page=page - 1, **params)
629 |         else:
630 |             last_page_url = Config.get_route('/feeds', page=page - 1, **params)
631 |     else:
632 |         last_page_url = ''
633 |     context['last_page_url'] = last_page_url
634 |     context['rss_url'] = Config.get_route('/rss_feeds', **params)
635 |     return templates.TemplateResponse("feeds.html", context=context)
636 | 
637 | 
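# A sketch (not part of app.py) of calling the POST /lite endpoint above,
# which returns the stored result_list of a single task. The host/port and
# the auth cookie value are assumptions:
#
#     import requests
#
#     resp = requests.post('http://127.0.0.1:9901/lite',
#                          json={'task_id': 1},
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())   # {'result_list': [...]} or {'result_list': []}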
638 | @app.get("/rss_feeds")
639 | async def rss_feeds(request: Request,
640 |                     tag: str = '',
641 |                     sign: str = '',
642 |                     host: str = Header('', alias='Host'),
643 |                     group_ids: str = ''):
644 |     if group_ids:
645 |         task_ids = tuple(await query_group_task_ids(group_ids))
646 |         if not task_ids:
647 |             return JSONResponse(
648 |                 status_code=404,
649 |                 content={
650 |                     "message": 'no tasks found',
651 |                 },
652 |             )
653 |         feeds, _ = await query_feeds(task_ids=task_ids)
654 |     else:
655 |         feeds, _ = await query_feeds(tag=tag)
656 |     source_link = f'https://{host}'
657 |     xml_data: dict = {
658 |         'channel': {
659 |             'title': 'Watchdogs Timeline',
660 |             'description': f'Watchdog on web change, v{__version__}.',
661 |             'link': source_link,
662 |         },
663 |         'items': []
664 |     }
665 |     for feed in feeds:
666 |         pubDate: str = feed['ts_create'].strftime(
667 |             format='%a, %d %b %Y %H:%M:%S')
668 |         link: str = feed['url']
669 |         description: str = feed['text']
670 |         title: str = f'{feed["name"]}#{description[:Config.TEXT_SLICE_LENGTH]}'
671 |         item: dict = {
672 |             'title': title,
673 |             'link': link,
674 |             'guid': str(feed['id']),
675 |             'description': description,
676 |             'pubDate': pubDate
677 |         }
678 |         xml_data['items'].append(item)
679 |     xml: str = gen_rss(xml_data)
680 |     response = Response(
681 |         content=xml,
682 |         media_type="application/xml",
683 |         headers={'Content-Type': 'application/xml; charset="utf-8"'})
684 |     return response
685 | 
686 | 
687 | @app.get("/groups")
688 | async def groups_route(request: Request):
689 |     groups = await query_all_groups()
690 |     for _group in groups:
691 |         _group['href_feeds'] = Config.get_route('/feeds',
692 |                                                 group_ids=_group['id'])
693 |         _group['href_lite'] = Config.get_route('/lite', group_ids=_group['id'])
694 |     context = {
695 |         'request': request,
696 |         'groups': groups,
697 |     }
698 |     return templates.TemplateResponse("groups.html", context=context)
699 | 
700 | 
701 | @app.post("/update_group")
702 | async def update_group(group: Group, action: str):
703 |     try:
704 |         db = Config.db
705 |         # dispatch on the requested action
706 |         if action == 'new':
707 |             # insert a new group
708 |             query = groups.insert()
709 |             values = dict(group)
710 |             # id is None, let the database assign it
711 |             resp = await db.execute(query=query, values=values)
712 |         elif action == 'delete':
713 |             query = 'delete from groups where `id`=:id'
714 |             values = {'id': group.id}
715 |             resp = await db.execute(query=query, values=values)
716 |         else:
717 |             # update an existing group
718 |             query = 'update groups set `name`=:name,`task_ids`=:task_ids where `id`=:id'
719 |             values = {
720 |                 'id': group.id,
721 |                 'name': group.name,
722 |                 'task_ids': group.task_ids,
723 |             }
724 |             resp = await db.execute(query=query, values=values)
725 |         result = {'msg': 'ok', 'resp': str(resp)}
726 |     except Exception as e:
727 |         result = {'msg': repr(e)}
728 |     finally:
729 |         query_all_groups.cache_clear()
730 |         query_group_task_ids.cache_clear()
731 |     logger.info(f'[{action.title()}] {group}: {result}')
732 |     return result
733 | 
734 | 
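# A sketch (not part of app.py) of creating a group via /update_group. Field
# names mirror the Group model imported above ('id', 'name', 'task_ids'); the
# exact field types live in models.py, so treat this payload, the port and
# the cookie value as assumptions:
#
#     import requests
#
#     group = {'id': None, 'name': 'demo group', 'task_ids': '1,2,3'}
#     resp = requests.post('http://127.0.0.1:9901/update_group',
#                          params={'action': 'new'},  # or 'delete' / update
#                          json=group,
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())   # {'msg': 'ok', 'resp': ...} on success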
735 | @app.get("/sqlite")
736 | async def download_db():
737 |     if Config.db_url.startswith('sqlite:///'):
738 |         return FileResponse(path=Config.db_url.replace('sqlite:///', ''))
739 |     return Response(content=b'not sqlite', status_code=404)
740 | 
--------------------------------------------------------------------------------
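Since app.py exposes a standard ASGI application, it can also be served with uvicorn directly. A minimal sketch, assuming the dependencies from requirements.txt are installed; the host and port are arbitrary choices here, and the package's own entry point performs extra Config setup that this bypasses:

    import uvicorn

    if __name__ == '__main__':
        # serve watchdogs.app:app; Config must be initialized for startup() to succeed
        uvicorn.run('watchdogs.app:app', host='127.0.0.1', port=9901)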