├── images
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── d1.png
│   ├── d2.png
│   ├── d3.png
│   ├── d4.png
│   └── d5.png
├── watchdogs
│   ├── __main__.py
│   ├── static
│   │   ├── css
│   │   │   ├── fonts
│   │   │   │   └── element-icons.woff
│   │   │   ├── watchdogs.min.css
│   │   │   └── watchdogs.css
│   │   ├── img
│   │   │   └── favicon.svg
│   │   └── js
│   │       ├── clipboard.min.js
│   │       ├── watchdogs.min.js
│   │       └── vue-resource.min.js
│   ├── __init__.py
│   ├── background.py
│   ├── templates
│   │   ├── auth.html
│   │   ├── logs.html
│   │   ├── groups.html
│   │   ├── feeds.html
│   │   ├── lite.html
│   │   └── index.html
│   ├── main.py
│   ├── callbacks.py
│   ├── config.py
│   ├── utils.py
│   ├── settings.py
│   ├── crawler.py
│   ├── models.py
│   └── app.py
├── run_server.py
├── requirements.txt
├── LICENSE
├── .gitignore
├── setup.py
├── quick_start.md
└── README.md

/images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/1.png
--------------------------------------------------------------------------------
/images/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/2.png
--------------------------------------------------------------------------------
/images/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/3.png
--------------------------------------------------------------------------------
/images/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/4.png
--------------------------------------------------------------------------------
/images/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/5.png
--------------------------------------------------------------------------------
/images/d1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d1.png
--------------------------------------------------------------------------------
/images/d2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d2.png
--------------------------------------------------------------------------------
/images/d3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d3.png
--------------------------------------------------------------------------------
/images/d4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d4.png
--------------------------------------------------------------------------------
/images/d5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/images/d5.png
--------------------------------------------------------------------------------
/watchdogs/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 | 
3 | if __name__ == "__main__":
4 |     main()
5 | 
--------------------------------------------------------------------------------
/watchdogs/static/css/fonts/element-icons.woff:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClericPy/watchdogs/HEAD/watchdogs/static/css/fonts/element-icons.woff -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | from watchdogs.__main__ import main 2 | 3 | if __name__ == "__main__": 4 | # 1. pip install watchdogs 5 | # 2. python -m watchdogs 6 | main() 7 | -------------------------------------------------------------------------------- /watchdogs/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .config import Config 4 | from .main import init_app 5 | 6 | __version__ = '2.0.1' 7 | __all__ = ['Config', 'init_app'] 8 | logging.getLogger('watchdogs').addHandler(logging.NullHandler()) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles 2 | async_lru 3 | beautifulsoup4 4 | SQLAlchemy==1.4.41 5 | databases>=0.5.5 6 | pydantic<2.0.0 7 | fastapi 8 | fire 9 | jinja2 10 | jmespath 11 | jsonpath-rw-ext 12 | lxml 13 | objectpath 14 | pyyaml>=5.3 15 | selectolax 16 | toml 17 | torequests>=5.0.4 18 | uniparser>=3.0.2 19 | uvicorn 20 | -------------------------------------------------------------------------------- /watchdogs/static/css/watchdogs.min.css: -------------------------------------------------------------------------------- 1 | .full-screen,body,.el-tabs__content,.el-tabs__content > *{width:100%;height:100%;}html{margin:0 auto;zoom:90%;width:90%;height:90%;}.el-tabs__item{font-weight:bold;}.el-message-box--center{min-width:50%;}.el-message-box{width:auto;}.time-td{min-width:16em;padding-left:3em;}#input_host_form > .el-form-item:first-child .el-form-item__content,#input_host_form > .el-form-item:first-child{width:100%;}[aria-label='Edit Crawler JSON'] .el-textarea__inner{height:10em;}.el-table_1_column_8 > .cell{white-space:nowrap;}div.foot{display:flex;justify-content:center;}.host-tag{margin:0.5em;cursor:pointer;}.el-table .warning-row{background:oldlace;}.cb_name{cursor:pointer;padding-left:1em;}p.custom_links{text-align:center;color:black;background-color:rgba(223,223,223,0.5);padding:0.5em 0 0.5em 0;box-shadow:3px 3px 5px #888888;}.request_args_pre{font-size:0.9em;}[v-cloak]{display:none;}.el-popover{max-width:50%;}.el-message-box.work_hours_doc{width:40%;}pre{word-wrap:break-word;white-space:pre-wrap;} 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ClericPy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /watchdogs/background.py: -------------------------------------------------------------------------------- 1 | from asyncio import ensure_future, sleep 2 | from .utils import check_work_time, solo, try_catch 3 | from .config import Config 4 | 5 | 6 | async def crawl_chunks(crawl_once): 7 | loop_num = 0 8 | while not Config.is_shutdown: 9 | has_more = await crawl_once() 10 | if isinstance(has_more, Exception): 11 | Config.logger.error(f'crawl_once error, {has_more!r}') 12 | break 13 | Config.logger.info( 14 | f'crawl_once finished, has_more: {has_more}, loop: {loop_num}') 15 | if not has_more: 16 | break 17 | loop_num += 1 18 | 19 | 20 | async def background_loop(coro_funcs: list = None): 21 | while not Config.is_shutdown: 22 | # non-block running, and be constrained by SoloLock class 23 | for func in coro_funcs: 24 | if func.__name__ == 'crawl_once': 25 | ensure_future(try_catch(crawl_chunks, func)) 26 | else: 27 | ensure_future(try_catch(func)) 28 | await sleep(Config.check_interval) 29 | 30 | 31 | async def db_backup_handler(): 32 | logger = Config.logger 33 | if check_work_time(Config.db_backup_time): 34 | logger.warning(f'Backup DB start: {Config.db_backup_time}.') 35 | # may raise solo error 36 | with solo: 37 | result = await try_catch(Config.db_backup_function) 38 | logger.info(f'Backup DB finished: {result!r}') 39 | -------------------------------------------------------------------------------- /watchdogs/static/css/watchdogs.css: -------------------------------------------------------------------------------- 1 | .full-screen, 2 | body, 3 | .el-tabs__content, 4 | .el-tabs__content > * { 5 | width: 100%; 6 | height: 100%; 7 | } 8 | 9 | html { 10 | margin: 0 auto; 11 | zoom: 90%; 12 | width: 90%; 13 | height: 90%; 14 | } 15 | 16 | .el-tabs__item { 17 | font-weight: bold; 18 | } 19 | 20 | .el-message-box--center { 21 | min-width: 50%; 22 | } 23 | 24 | .el-message-box { 25 | width: auto; 26 | } 27 | 28 | .time-td { 29 | min-width: 16em; 30 | padding-left: 3em; 31 | } 32 | 33 | #input_host_form > .el-form-item:first-child .el-form-item__content, 34 | #input_host_form > .el-form-item:first-child { 35 | width: 100%; 36 | } 37 | 38 | [aria-label='Edit Crawler JSON'] .el-textarea__inner { 39 | height: 10em; 40 | } 41 | 42 | .el-table_1_column_8 > .cell { 43 | white-space: nowrap; 44 | } 45 | 46 | div.foot { 47 | display: flex; 48 | justify-content: center; 49 | } 50 | 51 | .host-tag { 52 | margin: 0.5em; 53 | cursor: pointer; 54 | } 55 | 56 | .el-table .warning-row { 57 | background: oldlace; 58 | } 59 | 60 | .cb_name { 61 | cursor: pointer; 62 | padding-left: 1em; 63 | } 64 | 65 | p.custom_links { 66 | text-align: center; 67 | color: black; 68 | background-color: rgba(223, 223, 223, 0.5); 69 | padding: 0.5em 0 0.5em 0; 70 | box-shadow: 3px 3px 5px #888888; 71 | } 72 | 73 | .request_args_pre { 74 | font-size: 0.9em; 75 | } 76 | 77 | [v-cloak] { 78 | display: none; 79 | } 80 | .el-popover { 81 | max-width: 50%; 82 
| } 83 | .el-message-box.work_hours_doc{ 84 | width: 40%; 85 | } 86 | pre { 87 | word-wrap: break-word; 88 | white-space: pre-wrap; 89 | } 90 | -------------------------------------------------------------------------------- /watchdogs/templates/auth.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {{action}} Watchdogs v{{version}} 10 | 11 | 19 | 20 | 21 | 22 | 25 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | tmp.py 131 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | import sys 5 | 6 | from setuptools import find_packages, setup 7 | """ 8 | linux: 9 | rm -rf "dist/*";rm -rf "build/*";python3 setup.py bdist_wheel;twine upload "dist/*;rm -rf "dist/*";rm -rf "build/*"" 10 | win32: 11 | rm -r -Force dist;rm -r -Force build;python3 setup.py bdist_wheel;twine upload "dist/*";rm -r -Force dist;rm -r -Force build;rm -r -Force watchdogs.egg-info 12 | """ 13 | 14 | py_version = sys.version_info 15 | if py_version.major < 3 or py_version.minor < 6: 16 | raise RuntimeError('Only support python3.6+') 17 | 18 | with open('requirements.txt') as f: 19 | install_requires = [line for line in f.read().strip().split('\n')] 20 | 21 | with open("README.md", encoding="u8") as f: 22 | long_description = f.read() 23 | 24 | if not re.search(r'postgresql|mysql|sqlite', str(sys.argv)): 25 | install_requires.append('aiosqlite') 26 | 27 | here = os.path.abspath(os.path.dirname(__file__)) 28 | with open(os.path.join(here, 'watchdogs', '__init__.py'), encoding="u8") as f: 29 | matched = re.search(r'''__version__ = ['"](.*?)['"]''', f.read()) 30 | if not matched: 31 | raise ValueError('Not find the __version__ info.') 32 | version = matched.group(1) 33 | 34 | description = "Watchdogs to keep an eye on the world's change. Read more: https://github.com/ClericPy/watchdogs." 
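# The extras_require mapping below exposes the optional database drivers,
# installable with standard pip extras syntax, e.g.:
#   pip install "watchdogs[mysql]"   or   pip install "watchdogs[postgresql]"
# When none of 'postgresql' / 'mysql' / 'sqlite' appears in sys.argv,
# the re.search check above adds aiosqlite as the default driver.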
35 | 36 | setup( 37 | name="watchdogs", 38 | version=version, 39 | keywords="requests crawler uniparser torequests fastapi watchdog", 40 | description=description, 41 | long_description=long_description, 42 | long_description_content_type='text/markdown', 43 | license="MIT License", 44 | install_requires=install_requires, 45 | py_modules=["watchdogs"], 46 | package_data={ 47 | 'watchdogs': [ 48 | 'templates/*.html', 'static/img/*.*', 'static/js/*.js', 49 | 'static/css/*.css', 'static/css/fonts/*.*' 50 | ] 51 | }, 52 | extras_require={ 53 | "postgresql": ["asyncpg", "psycopg2-binary"], 54 | "mysql": ["aiomysql", "pymysql"], 55 | "sqlite": ["aiosqlite"] 56 | }, 57 | classifiers=[ 58 | "License :: OSI Approved :: MIT License", 59 | 'Programming Language :: Python', 60 | "Programming Language :: Python :: 3", 61 | "Programming Language :: Python :: 3.6", 62 | "Programming Language :: Python :: 3.7", 63 | "Programming Language :: Python :: 3.8", 64 | ], 65 | author="ClericPy", 66 | author_email="clericpy@gmail.com", 67 | url="https://github.com/ClericPy/watchdogs", 68 | packages=find_packages(), 69 | platforms="any", 70 | ) 71 | -------------------------------------------------------------------------------- /watchdogs/templates/logs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | Watchdogs logs 13 | 14 | 15 | 16 | 56 | 57 | 58 | 59 |
60 | max_lines: 61 | 67 | refresh_every: 68 | 74 | log_names: 75 | 81 | 82 |
83 | {% for item in items %} 84 |
85 |
86 | {{item['name']}}.log 87 | {{item['line_no']}} lines ( {{item['file_size']}} / 89 | {{item['file_size_mb']}} MB ), st_mtime: {{item['st_mtime']}} 91 | 95 | 97 |
98 |
99 |
{{item['log_text']}}
100 | {% endfor %} 101 | 102 | 103 | -------------------------------------------------------------------------------- /watchdogs/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from fire import Fire 4 | from uvicorn import run 5 | 6 | from .config import NotSet, ensure_dir 7 | from .settings import Config, get_valid_value, setup 8 | 9 | 10 | def clear_dir(dir_path): 11 | if not dir_path.is_dir(): 12 | print(f'Dir is not exist: {dir_path}.') 13 | return True 14 | print(f'Cleaning {dir_path}...') 15 | for f in dir_path.iterdir(): 16 | if f.is_dir(): 17 | clear_dir(f) 18 | else: 19 | f.unlink() 20 | print(f'File removed: {f}') 21 | dir_path.rmdir() 22 | print(f'Folder removed: {dir_path}') 23 | 24 | 25 | def init_app(db_url=None, 26 | password=None, 27 | uninstall=False, 28 | mute_std_log=NotSet, 29 | mute_file_log=NotSet, 30 | md5_salt=None, 31 | config_dir=None, 32 | use_default_cdn=False, 33 | allow_new_request=False, 34 | **uvicorn_kwargs): 35 | if config_dir: 36 | Config.CONFIG_DIR = ensure_dir(config_dir) 37 | if uninstall: 38 | clear_dir(Config.CONFIG_DIR) 39 | sys.exit('Config dir cleared.') 40 | if allow_new_request: 41 | # will allow use requests / aiohttp / tPool / Requests in UDFParser 42 | import aiohttp 43 | import requests 44 | from torequests.dummy import Requests 45 | from torequests.main import tPool 46 | from uniparser.parsers import UDFParser 47 | 48 | UDFParser._GLOBALS_ARGS.update(aiohttp=aiohttp, 49 | requests=requests, 50 | Requests=Requests, 51 | tPool=tPool) 52 | # backward compatibility for ignore_stdout_log & ignore_file_log 53 | Config.mute_std_log = get_valid_value( 54 | [uvicorn_kwargs.pop('ignore_stdout_log', NotSet), mute_std_log], 55 | Config.mute_std_log) 56 | Config.mute_file_log = get_valid_value( 57 | [uvicorn_kwargs.pop('ignore_file_log', NotSet), mute_file_log], 58 | Config.mute_file_log) 59 | # update by given uvicorn_kwargs 60 | Config.uvicorn_kwargs.update(uvicorn_kwargs) 61 | if db_url: 62 | # update by given db_url 63 | Config.db_url = db_url 64 | Config.password = password 65 | Config.md5_salt = md5_salt or '' 66 | setup(use_default_cdn=use_default_cdn) 67 | from .app import app 68 | return app 69 | 70 | 71 | def start_app(db_url=None, 72 | password=None, 73 | uninstall=False, 74 | mute_std_log=NotSet, 75 | mute_file_log=NotSet, 76 | md5_salt=None, 77 | config_dir=None, 78 | use_default_cdn=False, 79 | allow_new_request=False, 80 | **uvicorn_kwargs): 81 | app = init_app(db_url=db_url, 82 | password=password, 83 | uninstall=uninstall, 84 | mute_std_log=mute_std_log, 85 | mute_file_log=mute_file_log, 86 | md5_salt=md5_salt, 87 | config_dir=config_dir, 88 | use_default_cdn=use_default_cdn, 89 | allow_new_request=allow_new_request, 90 | **uvicorn_kwargs) 91 | 92 | run(app, **Config.uvicorn_kwargs) 93 | 94 | 95 | def main(): 96 | argv = sys.argv 97 | if ('-h' in argv or '--help' in argv) and '--' not in argv: 98 | print( 99 | '"-h" and "--help" should be after "--", examples:\n > python -m watchdogs -- -h\n > python run_server.py -- -h' 100 | ) 101 | return 102 | Fire(start_app) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /quick_start.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Given a mission 5 | get the most popular repository in the github python trending page. 6 | 1. 
Here we crawl and parse the HTML from https://github.com/trending/python?since=daily
7 | 2. ~~Although you can get it from api.github.com~~
8 | 
9 | # Create a CrawlerRule
10 | 
11 | 1. Get the request args
12 |     1. Use the URL: https://github.com/trending/python?since=daily
13 |     2. Or copy the cURL string from Chrome
14 |         1. Chrome DevTools -> Network -> url (right-click) -> Copy -> Copy as cURL
15 |         2. Some scenes need cookie authentication or anti-crawler headers.
16 |         3. ![Copy cURL](https://github.com/ClericPy/watchdogs/raw/master/images/d1.png)
17 | 2. Create the crawler rule
18 |     1. Open the watchdogs page, which defaults to http://127.0.0.1:9901/
19 |     2. Click the \<New Task\> tab.
20 |     3. The first step is to set the CrawlerRule's meta info.
21 |         1. Start by making sure the request is correct.
22 |         2. Click the request args link.
23 |         3. Input the cURL string or URL you got before.
24 |         4. ![](https://github.com/ClericPy/watchdogs/raw/master/images/d2.png)
25 |         5. It then generates the default regex & request args; the regex may need some changes to match more URL patterns.
26 |         6. Click the download button and wait for it to finish downloading => Response Body [200]
27 |             1. If the input object is still null after downloading, it needs to be input manually.
28 |         7. Check the downloaded source code and make sure it is what you want.
29 |             1. You can also check it in the parse rules by using a rule named `__schema__`; the parser will raise an error unless this `__schema__` rule returns `True`.
30 |     4. Now set the ParseRules of this CrawlerRule.
31 |         1. A valid CrawlerRule should contain a `text` rule and a `url` rule, though the `url` rule is optional.
32 |         2. Delete the existing `text` rule and create a new parse rule named `list`.
33 |         3. Create a new parse rule as below: ![](https://github.com/ClericPy/watchdogs/raw/master/images/d3.png)
34 |             1. Here we get the list items for the child rules.
35 |         4. The `list` rule then needs two child rules named `text` and `url`.
36 |         5. Create a new parse rule named `text` like this: ![](https://github.com/ClericPy/watchdogs/raw/master/images/d4.png)
37 |             1. Click the button to send the `text` rule to the `list` rule.
38 |         6. Create a new parse rule named `url` the same way as `text`, or skip this rule. Note that the `$text` attribute should be `@href` here, to get the href attribute. This rule also needs to be sent to the `list` rule.
39 |     5. OK, now click the `Parse` button to parse this CrawlerRule and get the result.
40 |     6. Click the \<1. Save Crawler Rule\> button to save the rule into the database.
41 | 
42 | > Parse result
43 | 
44 | ```javascript
45 | {'Trending Python repositories on GitHub today · GitHub': {'list': {'text': 'gwen001 / pentest-tools', 'url': 'https://github.com/gwen001/pentest-tools'}}}
46 | ```
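The nested object above is the raw parser output. Watchdogs then normalizes it into the flat `{'text': ..., 'url': ...}` shape that gets stored as the task result, using `get_watchdog_result` from `watchdogs/utils.py` (included later in this listing). A minimal sketch of that normalization:

```python
from watchdogs.utils import get_watchdog_result

# Raw parser output: nested {rule_name: {child_rule: ...}} dicts.
parsed = {
    'Trending Python repositories on GitHub today · GitHub': {
        'list': {
            'text': 'gwen001 / pentest-tools',
            'url': 'https://github.com/gwen001/pentest-tools',
        }
    }
}

# get_watchdog_result drills into single-key dicts until it finds 'text',
# then keeps the optional keys ('url', 'title', 'cover', 'key', ...).
print(get_watchdog_result(parsed))
# {'text': 'gwen001 / pentest-tools', 'url': 'https://github.com/gwen001/pentest-tools'}
```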
47 | 
48 | > CrawlerRule JSON. This JSON string can be loaded back by clicking the \<Load Rule\> button.
49 | 
50 | ```javascript
51 | {"name":"Trending Python repositories on GitHub today · GitHub","request_args":{"method":"get","url":"https://github.com/trending/python?since=daily","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"list","chain_rules":[["css","h1.lh-condensed>a","$string"],["python","index","0"],["re","=\"/","@=\"https://github.com/"]],"child_rules":[{"name":"text","chain_rules":[["css","a","$text"],["py","index","0"],["udf","input_object.strip().replace('\\n', '')",""]],"child_rules":[],"iter_parse_child":false},{"name":"url","chain_rules":[["css","a","@href"],["python","index","0"]],"child_rules":[],"iter_parse_child":false}],"iter_parse_child":false}],"regex":"^https://github\\.com/trending/python\\?since=daily$","encoding":""}
52 | ```
53 | 
54 | # Create a Task
55 | 
56 | 1. Click the \<2. Add New Task\> button.
57 | 2. Confirm the task info. ![](https://github.com/ClericPy/watchdogs/raw/master/images/d5.png)
58 | 3. Click the submit button. The task is created.
59 | 
60 | # Update a Task
61 | 
62 | 1. Click the \<Tasks\> tab.
63 | 2. Double-click the task's row.
64 | 3. Update it and submit.
65 | 
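One task field worth a note is `work_hours`, which controls when a task is allowed to crawl. It is evaluated by `check_work_time` in `watchdogs/utils.py` (included later in this listing); a short sketch of the accepted formats, adapted from that function's docstring:

```python
from datetime import datetime

from watchdogs.utils import check_work_time

# A fixed timestamp so the checks below are reproducible.
now = datetime.strptime('2020-03-14 11:47:32', '%Y-%m-%d %H:%M:%S')

assert check_work_time('0, 24', now)          # hour range: always allowed
assert check_work_time('[1, 2, 3, 11]', now)  # explicit list of hours
assert check_work_time('%d==14', now)         # strftime equality check
assert check_work_time('16, 24|%M==47', now)  # '|' means any clause may pass
assert not check_work_time('0, 5', now)       # 11:47 is outside hours 0-5
```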
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [watchdogs](https://github.com/ClericPy/watchdogs) [![PyPI](https://img.shields.io/pypi/v/watchdogs?style=plastic)](https://pypi.org/project/watchdogs/)![PyPI - Wheel](https://img.shields.io/pypi/wheel/watchdogs?style=plastic)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/watchdogs?style=plastic)![PyPI - Downloads](https://img.shields.io/pypi/dm/watchdogs?style=plastic)![PyPI - License](https://img.shields.io/pypi/l/watchdogs?style=plastic)
2 | 
3 | Keep an eye on the changes of the web world.
4 | 
5 | Such as `post articles` / `news on the web portal` / `server api health` / `binge-watching` / `steam price fluctuation` / `github events` / `updates of comic and novel`, and so on...
6 | 
7 | ## Intro
8 | 
9 | > [中文文档 (Chinese docs)](https://clericpy.github.io/blog/posts/20200331171211/)
10 | 
11 | 1. This is a web app based on [fastapi](https://github.com/tiangolo/fastapi), [databases](https://github.com/encode/databases), [uniparser](https://github.com/ClericPy/uniparser), and [torequests](https://github.com/ClericPy/torequests).
12 | 2. Deploy it smoothly with pip: `pip install -U watchdogs;python3 -m watchdogs`
13 | 3. Creating a new crawler with the Web UI is simple, instead of the old way of writing duplicate code.
14 | 4. All the crawlers keep running in the async environment.
15 | 5. Almost all the elements in the Web UI have a *title* attribute describing their features, which means the docs live on the UI itself.
16 | 6. Free your hands from repetitively refreshing pages in the browser.
17 |     1. Subscribe to the change events with RSS reader extensions, such as [Feedbro](https://chrome.google.com/webstore/detail/feedbro/mefgmmbdailogpfhfblcnnjfmnpnmdfa) or RSS Feed Reader.
18 |     2. Or implement a class that inherits from `watchdogs.callbacks.Callback` (see the sketch at the end of this README).
19 | 
20 | ## Usage
21 | 
22 | 1. > pip install -U watchdogs
23 | 
24 | 2. > python -m watchdogs
25 | 
26 | 3. > Open the browser: http://127.0.0.1:9901
27 | 
28 | ### Command line args
29 | 
30 | > python -m watchdogs -- -h
31 | 
32 | - **db_url**:
33 |   > a sqlite / mysql / postgresql (not tested) URL, anything [databases](https://github.com/encode/databases) supports. Defaults to 'sqlite:///{HOME_PATH}/watchdogs/storage.sqlite'
34 | - **password**:
35 |   > initial password; if empty, it can be set on the first visit in the web UI.
36 | - **mute_std_log**:
37 |   > mute the stdout log for a clean stream.
38 | - **mute_file_log**:
39 |   > mute the file log located in the {HOME_PATH}/watchdogs folder.
40 | - **md5_salt**:
41 |   > salt for the custom md5(password) / md5(rss_tag) hashes.
42 | - **config_dir**:
43 |   > config dir for saving the logs and config files (including the sqlite file when using sqlite); defaults to {HOME_PATH}/watchdogs.
44 | - **use_default_cdn**:
45 |   > if Config.cdn_urls is not set and use_default_cdn is True, online js/css CDN links from staticfile.org will be used.
46 | - **\*\*uvicorn_kwargs**:
47 |   > uvicorn startup kwargs, such as port and host, which can be set like: `python -m watchdogs --port=9999 --host=127.0.0.1 --access-log=False`
48 | 
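These options can also be passed programmatically instead of via the CLI. A minimal sketch mirroring `start_app` in `watchdogs/main.py` (the `db_url` and `password` values here are placeholders):

```python
from uvicorn import run

from watchdogs import Config, init_app

# init_app accepts the same options as the command line; unknown keyword
# arguments (port, host, ...) are merged into Config.uvicorn_kwargs.
app = init_app(db_url='sqlite:///storage.sqlite',  # placeholder db_url
               password='change-me',               # placeholder password
               port=9901)
run(app, **Config.uvicorn_kwargs)
```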
49 | ### Quick Start to Create a New Task
50 | 
51 | [Quick Start Screenshots](https://github.com/ClericPy/watchdogs/blob/master/quick_start.md)
52 | 
53 | 
54 | ## Web UI
55 | 
56 | 
57 | Screenshots
58 | 
59 | 1. Welcome Page (Tasks Page).
60 |    > Here you can see all the tasks' metadata, go to the RSS / Mobile Lite pages, and perform operations on the tasks.
61 | 
62 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/1.png)
63 | 
64 | 2. New Task Page.
65 |    > Based on the latest [uniparser](https://github.com/ClericPy/uniparser) web app, for creating new rules as well as tasks.
66 | 
67 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/2.png)
68 | 
69 | 3. Rules Page.
70 |    > Perform operations on the rules.
71 | 
72 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/3.png)
73 | 
74 | 4. API Page.
75 |    > Based on the [fastapi](https://github.com/tiangolo/fastapi) `/docs`, which is generated automatically.
76 | 
77 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/4.png)
78 | 
79 | 5. Mobile Page (Lite View).
80 |    > A view for mobile phones, to glimpse the latest results of the current 30 tasks.
81 | 
82 | ![image](https://github.com/ClericPy/watchdogs/raw/master/images/5.png)
83 | 
84 | 
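As mentioned in the Intro, custom notifications are added by implementing a class that inherits from `watchdogs.callbacks.Callback` (the full base class is in `watchdogs/callbacks.py` below). A minimal sketch; the `my_webhook` name and endpoint URL are hypothetical placeholders:

```python
from json import loads

from watchdogs.callbacks import Callback


class WebhookCallback(Callback):
    """POST the latest task result to a (hypothetical) webhook."""
    # Tasks route here when task.custom_info starts with 'my_webhook:'.
    name = 'my_webhook'

    def __init__(self):
        from torequests.dummy import Requests
        self.req = Requests()

    async def callback(self, task):
        latest_result = loads(task.latest_result or '{}')
        # example.com is a placeholder; point this at your own service.
        r = await self.req.post('https://example.com/webhook',
                                json={
                                    'task': task.name,
                                    'text': latest_result.get('text') or '',
                                    'url': latest_result.get('url') or task.origin_url,
                                })
        return f'sent: {bool(r)}'
```

Subclasses are collected via `Callback.__subclasses__()` when the callback handler initializes, so the class must be defined (imported) before the app starts.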
85 | 86 | 92 | -------------------------------------------------------------------------------- /watchdogs/templates/groups.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Groups v{{version}} 10 | 11 | 12 | 50 | 51 | 52 | 53 |
54 |
55 |

56 | Watchdogs 57 |

58 |
59 | 60 | 62 |
63 |
64 |
65 | {% for group in groups %} 66 |
67 | id: 68 | name: 69 | task_ids: 71 | 72 | 74 | Lite 75 | Feed 76 |
77 | {% endfor %} 78 |
79 |
80 |
81 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /watchdogs/static/img/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 15 | 17 | 19 | 56 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /watchdogs/templates/feeds.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Timeline v{{version}} 10 | 11 | 12 | 172 | 173 | 174 | 175 |
176 |
177 |

Watchdogs

178 |
179 |
180 |
181 |
182 | {% for task in error_tasks %} 183 |

[{{task.name}}]: {{task.error}}

184 | {% endfor %} 185 |
186 | {%- if not feeds -%} 187 |

No Feeds.

188 | {% endif %} 189 | {% for feed in feeds %} 190 | {%- if feed.get("name") -%} 191 |
192 |

193 | {{feed.name}} 194 |

195 |

{{feed.text}}

196 |

197 | {{feed.ts_create.strftime('%Y-%m-%d %H:%M:%S')}} 198 | 199 | - {{feed.timeago}} ago 200 |

201 |
202 | {% endif %} 203 | {%- if feed.get("current_date") -%} 204 | 205 | {% endif %} 206 | {% endfor %} 207 |
208 |
209 |
210 | {% if last_page_url %} 211 | < 212 | {% endif %} 213 | Home 214 | RSS 215 | {% if next_page_url %} 216 | > 217 | {% endif %} 218 |
219 |
220 | 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /watchdogs/callbacks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from json import loads 3 | from logging import getLogger 4 | from traceback import format_exc 5 | from typing import Dict, Type 6 | 7 | from torequests.utils import ttime 8 | 9 | from .utils import ensure_await_result 10 | 11 | 12 | class CallbackHandlerBase(ABC): 13 | logger = getLogger('watchdogs') 14 | 15 | def __init__(self): 16 | # lazy init object 17 | self.callbacks_dict: Dict[str, Type[Callback]] = {} 18 | for cls in Callback.__subclasses__(): 19 | try: 20 | assert cls.name is not None 21 | cls.doc = cls.doc or cls.__doc__ 22 | self.callbacks_dict[cls.name] = cls 23 | except Exception as err: 24 | self.logger.error(f'{cls} registers failed: {err!r}') 25 | self.workers = {cb.name: cb.doc for cb in self.callbacks_dict.values()} 26 | 27 | @abstractmethod 28 | async def callback(self, task): 29 | pass 30 | 31 | def get_callback(self, name): 32 | obj = self.callbacks_dict.get(name) 33 | if not obj: 34 | # not found callback 35 | return None 36 | if not isinstance(obj, Callback): 37 | # here for lazy init 38 | obj = obj() 39 | self.callbacks_dict[name] = obj 40 | return obj 41 | 42 | 43 | class CallbackHandler(CallbackHandlerBase): 44 | 45 | def __init__(self): 46 | super().__init__() 47 | 48 | async def callback(self, task): 49 | custom_info: str = task.custom_info.strip() 50 | name = custom_info.split(':', 1)[0] 51 | cb = self.get_callback(name) or self.get_callback('') 52 | if not cb: 53 | # not found callback, ignore 54 | return 55 | try: 56 | call_result = await ensure_await_result(cb.callback(task)) 57 | self.logger.info( 58 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} {call_result}: ' 59 | ) 60 | except Exception: 61 | self.logger.error( 62 | f'{cb.name or "default"} callback({custom_info}) for task {task.name} error:\n{format_exc()}' 63 | ) 64 | 65 | 66 | class Callback(ABC): 67 | """ 68 | Constraint: Callback object should has this attribute: 69 | cls.name: str 70 | self.callback(task) 71 | if name == '': It's the default callback for null custom info. 72 | More common notify middleware is coming. 73 | """ 74 | logger = getLogger('watchdogs') 75 | # reset by subclass 76 | name: str = None 77 | doc = '' 78 | 79 | @abstractmethod 80 | def callback(self, task): 81 | """task attributes is new crawled""" 82 | pass 83 | 84 | 85 | class ServerChanCallback(Callback): 86 | """ 87 | Wechat notify toolkit. 88 | 89 | 1. Login with github: http://sc.ftqq.com/ 90 | 2. Click http://sc.ftqq.com/?c=code the SCKEY 91 | 3. 
Set the task.custom_info as: server_chan:{SCKEY} 92 | """ 93 | name = "server_chan" 94 | 95 | # doc = 'http://sc.ftqq.com/' 96 | TEXT_SLICE_LENGTH = 200 97 | 98 | def __init__(self): 99 | from torequests.dummy import Requests 100 | self.req = Requests() 101 | 102 | async def callback(self, task): 103 | name, arg = task.custom_info.split(':', 1) 104 | if not arg: 105 | raise ValueError( 106 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`' 107 | ) 108 | latest_result = loads(task.latest_result or '{}') 109 | text = latest_result.get('text') or '' 110 | url = latest_result.get('url') or task.origin_url 111 | title = f'{task.name}#{text[:self.TEXT_SLICE_LENGTH]}' 112 | body = f'{url}\n\n{text}' 113 | oks = [] 114 | for key in set(arg.strip().split()): 115 | if not key or not key.strip(): 116 | continue 117 | key = key.strip() 118 | r = await self.req.post(f'https://sc.ftqq.com/{key}.send', 119 | data={ 120 | 'text': title, 121 | 'desp': body 122 | }) 123 | self.logger.info(f'ServerChanCallback ({key}): {r.text}') 124 | oks.append((key, bool(r))) 125 | return f'{len(oks)} sended, {oks}' 126 | 127 | 128 | class DingTalkCallback(Callback): 129 | """ 130 | DingDing robot notify toolkit. Will auto check msg type as text / card. 131 | 132 | 1. Create a group. 133 | 2. Create a robot which contains word ":" 134 | 3. Set the task.custom_info as: dingding:{access_token} 135 | 136 | Doc: https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq/e9d991e2 137 | """ 138 | name = "dingding" 139 | 140 | def __init__(self): 141 | from torequests.dummy import Requests 142 | self.req = Requests() 143 | 144 | def make_data(self, task): 145 | latest_result = loads(task.latest_result or '{}') 146 | title = latest_result.get('title') or '' 147 | url = latest_result.get('url') or task.origin_url 148 | text = latest_result.get('text') or '' 149 | cover = latest_result.get('cover') or '' 150 | if cover: 151 | text = f'![cover]({cover})\n{text}' 152 | if url or cover: 153 | # markdown 154 | title = f'# {task.name}: {title}\n> {ttime()}' 155 | return { 156 | "actionCard": { 157 | "title": title, 158 | "text": f'{title}\n\n{text}', 159 | "singleTitle": "Read More", 160 | "singleURL": url 161 | }, 162 | "msgtype": "actionCard" 163 | } 164 | return { 165 | "msgtype": "text", 166 | "text": { 167 | "content": f"{task.name}: {title}\n{text}" 168 | } 169 | } 170 | 171 | async def callback(self, task): 172 | name, arg = task.custom_info.split(':', 1) 173 | if not arg: 174 | raise ValueError( 175 | f'{task.name}: custom_info `{task.custom_info}` missing args after `:`' 176 | ) 177 | 178 | data = self.make_data(task) 179 | oks = [] 180 | for access_token in set(arg.strip().split()): 181 | if not access_token or not access_token.strip(): 182 | continue 183 | access_token = access_token.strip() 184 | r = await self.req.post( 185 | f'https://oapi.dingtalk.com/robot/send?access_token={access_token}', 186 | json=data) 187 | self.logger.info( 188 | f'{self.__class__.__name__} ({access_token}): {r.text}') 189 | oks.append((access_token, bool(r))) 190 | return f'{len(oks)} sended, {oks}' 191 | -------------------------------------------------------------------------------- /watchdogs/templates/lite.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Watchdogs Lite v{{version}} 10 | 11 | 12 | 168 | 169 | 170 | 171 |
172 |
173 |

Watchdogs

174 |
175 |
176 |
177 | {%- if not tasks -%} 178 |

No Tasks.

179 | {% endif %} {% for task in tasks %} 180 |
181 |

182 | {{task.name}} 183 |

184 |

{{task.text}}

185 |

186 | {{ '✔' if task.enable else '✖'}} 187 | {{task.last_change_time.strftime('%Y-%m-%d %H:%M:%S')}} 188 | 189 | - {{task.timeago}} ago 190 | 📂 191 | View More 192 | {%- if task.error -%} 193 |
194 | 195 | Error: {{task.error}} 196 | 197 | {% endif %} 198 |

199 |
200 | {% endfor %} 201 |
202 |
203 |
204 | {% if last_page_url %} 205 | < 206 | {% endif %} 207 | Home 208 | RSS 209 | {% if next_page_url %} 210 | > 211 | {% endif %} 212 |
213 |
214 | 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /watchdogs/config.py: -------------------------------------------------------------------------------- 1 | from logging import ERROR, INFO, Formatter, getLogger 2 | from pathlib import Path 3 | from time import time 4 | from traceback import format_exc 5 | from typing import Any, Callable, Dict, List 6 | 7 | from databases import Database 8 | from fastapi import Request 9 | from fastapi.middleware.gzip import GZipMiddleware 10 | from frequency_controller import AsyncFrequency 11 | from starlette.middleware.base import BaseHTTPMiddleware 12 | from starlette.responses import JSONResponse, RedirectResponse 13 | from torequests.utils import md5 as _md5 14 | from torequests.utils import parse_qsl, quote_plus, unparse_qsl 15 | from uniparser.crawler import RuleStorage 16 | 17 | from .callbacks import CallbackHandlerBase 18 | 19 | logger = getLogger('watchdogs') 20 | logger.setLevel(INFO) 21 | 22 | NotSet = object() 23 | 24 | 25 | # @app.exception_handler(Exception) 26 | async def exception_handler(request: Request, exc: Exception): 27 | trace_id = str(int(time() * 1000)) 28 | err_name = exc.__class__.__name__ 29 | err_value = str(exc) 30 | msg = f'{err_name}({err_value}) trace_id: {trace_id}:\n{format_exc()}' 31 | logger.error(msg) 32 | return JSONResponse( 33 | status_code=500, 34 | content={ 35 | "message": f"Oops! {err_name}.", 36 | "trace_id": trace_id 37 | }, 38 | ) 39 | 40 | 41 | def ensure_dir(path: Path): 42 | if isinstance(path, str): 43 | path = Path(path) 44 | if path.is_dir(): 45 | return path 46 | else: 47 | paths = list(reversed(path.parents)) 48 | paths.append(path) 49 | p: Path 50 | for p in paths: 51 | if not p.is_dir(): 52 | p.mkdir() 53 | return path 54 | 55 | 56 | def get_sign(path, query): 57 | given_sign = '' 58 | query_list = [] 59 | for key, value in parse_qsl(query, keep_blank_values=True): 60 | if key == 'sign': 61 | given_sign = value 62 | else: 63 | query_list.append(f'{key}={value}') 64 | query_list.sort() 65 | valid_sign = md5(f'{path}?{"&".join(query_list)}') 66 | return given_sign, valid_sign 67 | 68 | 69 | async def auth_checker(request: Request, call_next): 70 | # {'type': 'http', 'http_version': '1.1', 'server': ('127.0.0.1', 9901), 'client': ('127.0.0.1', 7037), 'scheme': 'http', 'method': 'GET', 'root_path': '', 'path': '/auth', 'raw_path': b'/auth', 'query_string': b'', 'headers': [(b'host', b'127.0.0.1:9901'), (b'connection', b'keep-alive'), (b'sec-fetch-dest', b'image'), (b'user-agent', b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'), (b'dnt', b'1'), (b'accept', b'image/webp,image/apng,image/*,*/*;q=0.8'), (b'sec-fetch-site', b'same-origin'), (b'sec-fetch-mode', b'no-cors'), (b'referer', b'http://127.0.0.1:9901/auth'), (b'accept-encoding', b'gzip, deflate, br'), (b'accept-language', b'zh-CN,zh;q=0.9'), (b'cookie', b'ads_id=lakdsjflakjdf; _ga=GA1.1.1550108461.1583462251')], 'fastapi_astack': , 'app': } 71 | path = request.scope['path'] 72 | if path in Config.AUTH_PATH_WHITE_LIST: 73 | # ignore auth check 74 | return await call_next(request) 75 | query_string = request.scope.get('query_string', b'').decode('u8') 76 | query_has_sign = 'sign=' in query_string 77 | if query_has_sign: 78 | # try checking sign 79 | given_sign, valid_sign = Config.get_sign(path, query_string) 80 | if given_sign == valid_sign: 81 | # sign checking pass 82 | return await call_next(request) 83 | # try 
check cookie 84 | if not Config.watchdog_auth or Config.watchdog_auth == request.cookies.get( 85 | 'watchdog_auth', ''): 86 | # valid cookie, or no watchdog_auth checker 87 | return await call_next(request) 88 | # not pass either checker, refused 89 | if query_has_sign: 90 | # request with sign will not redirect 91 | return JSONResponse( 92 | status_code=400, 93 | content={ 94 | "message": 'signature expired', 95 | }, 96 | ) 97 | else: 98 | # bad cookie, reset the watchdog_auth cookie as null 99 | resp = RedirectResponse( 100 | f'/auth?redirect={quote_plus(request.scope["path"])}', 302) 101 | resp.set_cookie('watchdog_auth', '') 102 | return resp 103 | 104 | 105 | class Config: 106 | CONFIG_DIR: Path = ensure_dir(Path.home() / 'watchdogs') 107 | ENCODING = 'utf-8' 108 | AUTH_PATH_WHITE_LIST = {'/auth'} 109 | # db_url defaults to sqlite:// 110 | db_url: str = f'sqlite:///{(CONFIG_DIR / "storage.sqlite").as_posix()}' 111 | db: Database = None 112 | logger = logger 113 | password: str = '' 114 | rule_db: RuleStorage = None 115 | metas = None 116 | check_interval: int = 60 117 | default_interval: int = 5 * 60 118 | default_crawler_timeout: int = 30 119 | downloader_timeout: int = 15 120 | watchdog_auth: str = '' 121 | md5_salt: str = '' 122 | crawler = None 123 | # anti brute force attack 124 | check_pwd_freq = AsyncFrequency(1, 3) 125 | # for anti-crawl frequency 126 | DEFAULT_HOST_FREQUENCY = (1, 1) 127 | cdn_urls: dict = {} 128 | callback_handler: CallbackHandlerBase = None 129 | mute_std_log = False 130 | mute_file_log = False 131 | LOGGING_FILE_CONFIG = { 132 | 'info.log': { 133 | 'file_size_mb': 2, 134 | 'level': INFO, 135 | 'backup_count': 1, 136 | }, 137 | 'error.log': { 138 | 'file_size_mb': 2, 139 | 'level': ERROR, 140 | 'backup_count': 1, 141 | }, 142 | 'server.log': { 143 | 'file_size_mb': 2, 144 | 'level': INFO, 145 | 'backup_count': 1, 146 | }, 147 | } 148 | DEFAULT_LOGGER_FORMATTER = Formatter( 149 | "%(asctime)s %(levelname)-5s [%(name)s] %(filename)s(%(lineno)s): %(message)s", 150 | datefmt="%Y-%m-%d %H:%M:%S") 151 | uvicorn_kwargs: dict = {'access_log': True, 'port': 9901} 152 | # check interval 60s, so format do use %M , backup every 12 hours. this pattern may miss for crawl cost more than 60s. 
153 | # db_backup_time: str = '%H:%M==00:00|%H:%M==12:00' 154 | db_backup_time: str = '%H:%M==00:00' 155 | db_backup_count: int = 4 156 | db_backup_function: Callable[..., Any] = None 157 | exception_handlers: list = [ 158 | (Exception, exception_handler), 159 | ] 160 | middlewares = [ 161 | { 162 | 'middleware_class': BaseHTTPMiddleware, 163 | 'dispatch': auth_checker 164 | }, 165 | { 166 | 'middleware_class': GZipMiddleware, 167 | 'minimum_size': 1000 168 | }, 169 | ] 170 | md5_cache_maxsize = 128 171 | query_groups_cache_maxsize = 128 172 | query_group_task_ids_cache_maxsize = 128 173 | query_task_ids_cache_maxsize = 128 174 | query_tasks_cache_maxsize = 128 175 | query_feeds_cache_maxsize = 128 176 | metas_cache_maxsize = 128 177 | sign_cache_maxsize = 128 178 | _md5 = _md5 179 | get_sign = get_sign 180 | background_task = None 181 | background_funcs: List[Callable] = [] 182 | is_shutdown = False 183 | custom_links = [ 184 | { 185 | 'label': 'Auth', 186 | 'url': '/auth', 187 | 'desc': 'change your password', 188 | }, 189 | { 190 | 'label': 'Logs', 191 | 'url': '/log', 192 | 'desc': 'view the logs', 193 | }, 194 | { 195 | 'label': 'Docs', 196 | 'url': '/docs', 197 | 'desc': 'read the docs', 198 | }, 199 | { 200 | 'label': 'Groups', 201 | 'url': '/groups', 202 | 'desc': 'admin the groups', 203 | }, 204 | ] 205 | # custom_tabs = [{'name': 'apis', 'label': 'API', 'url': '/docs'}] 206 | custom_tabs: List[Dict] = [] 207 | COLLATION: str = None 208 | cookie_max_age = 86400 * 7 209 | default_page_size = 20 210 | TEXT_SLICE_LENGTH = 200 211 | 212 | @classmethod 213 | def get_route(cls, path, **kwargs): 214 | params_string = unparse_qsl([ 215 | (k, str(v)) for k, v in kwargs.items() if str(v) 216 | ]) 217 | sign = cls.get_sign(path, params_string)[1] 218 | if params_string: 219 | result = f'{path}?{params_string}&sign={sign}' 220 | else: 221 | result = f'{path}?sign={sign}' 222 | return result 223 | 224 | @classmethod 225 | def add_custom_tabs(cls, label, url, name=None, desc=None): 226 | # desc is nonsense 227 | assert name or label 228 | cls.custom_tabs.append({ 229 | 'label': label, 230 | 'name': name or label, 231 | 'url': url, 232 | 'desc': desc 233 | }) 234 | 235 | @classmethod 236 | def add_custom_links(cls, url, name, label=None, desc=None): 237 | assert name or label 238 | cls.custom_tabs.append({ 239 | 'name': name or label, 240 | 'label': label or name, 241 | 'url': url, 242 | 'desc': desc 243 | }) 244 | 245 | @classmethod 246 | def setup_middleware(cls, app): 247 | for middleware in cls.middlewares: 248 | app.add_middleware(**middleware) 249 | 250 | 251 | def md5(obj, n=32, with_salt=True): 252 | if not with_salt: 253 | return Config._md5(str(obj).encode('utf-8'), n=n, skip_encode=True) 254 | salt = Config.md5_salt 255 | if not salt: 256 | raise ValueError('Config.md5_salt should not be null') 257 | return Config._md5(f'{obj}{salt}'.encode('utf-8'), n=n) 258 | 259 | 260 | async def md5_checker(obj, target, freq=False): 261 | if freq: 262 | async with Config.check_pwd_freq: 263 | # anti guessing password 264 | return md5(obj) == target 265 | else: 266 | # may get a cache 267 | return md5(obj) == target 268 | -------------------------------------------------------------------------------- /watchdogs/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from inspect import isawaitable 4 | from json import dumps, loads 5 | from logging import getLogger 6 | from sys import _getframe 7 | from traceback 
import format_exc 8 | from typing import Optional 9 | from xml.sax.saxutils import escape 10 | 11 | logger = getLogger('watchdogs') 12 | 13 | 14 | def format_size(size, rounded=2): 15 | unit = 'B' 16 | for _unit in ['B', 'KB', 'MB', 'GB']: 17 | unit = _unit 18 | if size > 1024: 19 | size = size / 1024 20 | else: 21 | break 22 | return f'{round(size, rounded)} {unit}' 23 | 24 | 25 | async def ensure_await_result(result): 26 | if isawaitable(result): 27 | return await result 28 | return result 29 | 30 | 31 | def _check_work_time(work_hours, now: Optional[datetime] = None): 32 | now = now or datetime.now() 33 | if '==' in work_hours: 34 | # check work days, using strftime 35 | fmt, target = work_hours.split('==') 36 | current = now.strftime(fmt) 37 | # check current time format equals to target 38 | return current == target 39 | elif '!=' in work_hours: 40 | # check work days, using strftime 41 | fmt, target = work_hours.split('!=') 42 | current = now.strftime(fmt) 43 | # check current time format equals to target 44 | return current != target 45 | else: 46 | # other hours format 47 | current_hour = now.hour 48 | if work_hours[0] == '[' and work_hours[-1] == ']': 49 | work_hours_list = sorted(loads(work_hours)) 50 | else: 51 | nums = [int(num) for num in re.findall(r'\d+', work_hours)] 52 | work_hours_list = sorted(range(*nums)) 53 | # check if current_hour is work hour 54 | return current_hour in work_hours_list 55 | 56 | 57 | def check_work_time(work_hours, now: Optional[datetime] = None): 58 | """Check time if fit work_hours. 59 | 60 | :: Test Code 61 | 62 | from watchdogs.utils import check_work_time, datetime 63 | 64 | now = datetime.strptime('2020-03-14 11:47:32', '%Y-%m-%d %H:%M:%S') 65 | 66 | oks = [ 67 | '0, 24', 68 | '[1, 2, 3, 11]', 69 | '[1, 2, 3, 11];%Y==2020', 70 | '%d==14', 71 | '16, 24|[11]', 72 | '16, 24|%M==47', 73 | '%M==46|%M==47', 74 | '%H!=11|%d!=12', 75 | '16, 24|%M!=41', 76 | ] 77 | 78 | for work_hours in oks: 79 | ok = check_work_time(work_hours, now) 80 | print(ok, work_hours) 81 | assert ok 82 | 83 | no_oks = [ 84 | '0, 5', 85 | '[1, 2, 3, 5]', 86 | '[1, 2, 3, 11];%Y==2021', 87 | '%d==11', 88 | '16, 24|[12]', 89 | '%M==17|16, 24', 90 | '%M==46|[1, 2, 3]', 91 | '%H!=11&%d!=12', 92 | '%M!=46;%M!=47', 93 | ] 94 | 95 | for work_hours in no_oks: 96 | ok = check_work_time(work_hours, now) 97 | print(ok, work_hours) 98 | assert not ok 99 | 100 | 101 | """ 102 | now = now or datetime.now() 103 | if '|' in work_hours: 104 | if '&' in work_hours or ';' in work_hours: 105 | raise ValueError('| can not use with "&" or ";"') 106 | return any((_check_work_time(partial_work_hour, now) 107 | for partial_work_hour in work_hours.split('|'))) 108 | else: 109 | if ('&' in work_hours or ';' in work_hours) and '|' in work_hours: 110 | raise ValueError('| can not use with "&" or ";"') 111 | return all((_check_work_time(partial_work_hour, now) 112 | for partial_work_hour in re.split('&|;', work_hours))) 113 | 114 | 115 | def get_watchdog_result(item): 116 | """ 117 | Parse result format like: 118 | {'text': 'xxx'} 119 | {'text': 'xxx', 'url': 'xxx'} 120 | {'rule_name': {'text': 'xxx'}} 121 | {'__result__': {'rule_name': {'text': 'xxx'}}} 122 | 123 | def test_result_schema(): 124 | # standard result 125 | result = get_watchdog_result({ 126 | 'url': 'https://www.python.org/dev/peps/pep-0001', 127 | 'text': 'text' 128 | }) 129 | # print(result) 130 | assert result == { 131 | 'url': 'https://www.python.org/dev/peps/pep-0001', 132 | 'text': 'text' 133 | } 134 | # only text 135 | result = 
get_watchdog_result('https://www.python.org/dev/peps/pep-0001') 136 | # print(result) 137 | assert result == {'text': 'text not found'} 138 | # embed request 139 | result = get_watchdog_result({ 140 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 141 | '__result__': { 142 | 'detail': { 143 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 144 | } 145 | } 146 | }) 147 | # print(result) 148 | assert result == {'text': 'PEP 1 -- PEP Purpose and Guidelines'} 149 | # embed request list 150 | result = get_watchdog_result({ 151 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 152 | '__result__': { 153 | 'detail': [{ 154 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 155 | }] 156 | } 157 | }) 158 | # print(result) 159 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}] 160 | # embed request list2 161 | result = get_watchdog_result({ 162 | '__request__': 'https://www.python.org/dev/peps/pep-0001', 163 | '__result__': { 164 | 'rule_name': { 165 | '__result__': { 166 | 'detail': [{ 167 | 'text': 'PEP 1 -- PEP Purpose and Guidelines' 168 | }] 169 | } 170 | } 171 | } 172 | }) 173 | # print(result) 174 | assert result == [{'text': 'PEP 1 -- PEP Purpose and Guidelines'}] 175 | # child rule result 176 | result = get_watchdog_result({ 177 | 'url': 'https://www.python.org/dev/peps/pep-0001', 178 | 'text': 'text' 179 | }) 180 | # print(result) 181 | assert result == { 182 | 'text': 'text', 183 | 'url': 'https://www.python.org/dev/peps/pep-0001' 184 | } 185 | result = get_watchdog_result({ 186 | 'list': { 187 | 'detail': [{ 188 | 'text': 'Wake up to WonderWidgets!', 189 | 'url': 'all' 190 | }, { 191 | 'text': 'Overview', 192 | 'url': 'all' 193 | }] 194 | } 195 | }) 196 | # print(result) 197 | assert result == [{ 198 | 'text': 'Wake up to WonderWidgets!', 199 | 'url': 'all' 200 | }, { 201 | 'text': 'Overview', 202 | 'url': 'all' 203 | }] 204 | 205 | """ 206 | result = {'text': 'text not found'} 207 | if isinstance(item, dict): 208 | __result__ = item.pop('__result__', None) 209 | if __result__: 210 | # may be __result__ > __result__ > __result__ nested... 
211 | return get_watchdog_result(__result__.popitem()[1]) 212 | text = item.get('text') 213 | if text is None: 214 | return get_watchdog_result(item.popitem()[1]) 215 | result = {'text': str(text)} 216 | for key in ['__key__', 'unique', 'key', 'cover', 'url', 'title']: 217 | if key in item: 218 | value = item[key] 219 | if value and str(value): 220 | result[key] = str(value) 221 | 222 | elif isinstance(item, (list, tuple)): 223 | result = [get_watchdog_result(i) for i in item] 224 | return result 225 | 226 | 227 | class SoloLock: 228 | 229 | def __init__(self): 230 | self.runnings: set = set() 231 | 232 | @property 233 | def current_name(self): 234 | return _getframe(2).f_code.co_name 235 | 236 | def acquire(self, name=None): 237 | name = name or self.current_name 238 | if name in self.runnings: 239 | raise RuntimeError(f'[{name}] is still running.') 240 | self.runnings.add(name) 241 | 242 | def release(self, name=None): 243 | name = name or self.current_name 244 | self.runnings.discard(name) 245 | 246 | def __enter__(self): 247 | self.acquire(self.current_name) 248 | return self 249 | 250 | def __exit__(self, *args): 251 | self.release(self.current_name) 252 | return self 253 | 254 | 255 | async def try_catch(func, *args, **kwargs): 256 | try: 257 | return await ensure_await_result(func(*args, **kwargs)) 258 | except BaseException as err: 259 | logger.error( 260 | f'Catch an error while running {func.__name__}: {format_exc()}') 261 | return err 262 | 263 | 264 | def ignore_error(func, *args, **kwargs): 265 | try: 266 | return func(*args, **kwargs) 267 | except BaseException as err: 268 | return err 269 | 270 | 271 | def gen_rss(data): 272 | nodes = [] 273 | channel = data['channel'] 274 | item_keys = ['title', 'description', 'link', 'guid', 'pubDate'] 275 | for item in data['items']: 276 | item_nodes = [] 277 | for key in item_keys: 278 | value = item.get(key) 279 | if value: 280 | item_nodes.append(f'<{key}>{escape(value)}') 281 | nodes.append(''.join(item_nodes)) 282 | items_string = ''.join((f'{tmp}' for tmp in nodes)) 283 | return rf''' 284 | 285 | 286 | {channel['title']} 287 | {channel['link']} 288 | {channel['description']} 289 | 290 | {channel['link']}/static/img/favicon.svg 291 | {channel['title']} 292 | {channel['link']} 293 | 32 294 | 32 295 | 296 | {items_string} 297 | 298 | 299 | ''' 300 | 301 | 302 | def get_result_key(result: dict): 303 | key = result.get('__key__', result.get('key')) 304 | if key: 305 | return key 306 | else: 307 | return dumps(result, sort_keys=True) 308 | 309 | 310 | solo = SoloLock() 311 | -------------------------------------------------------------------------------- /watchdogs/static/js/clipboard.min.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * clipboard.js v2.0.4 3 | * https://zenorocha.github.io/clipboard.js 4 | * 5 | * Licensed MIT © Zeno Rocha 6 | */ 7 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n({activeName:"tasks",uniparser_iframe_loaded:!1,task_info_visible:!1,rule_info_visible:!1,current_host_rule:{},new_task_form:{},has_more:!0,task_list:[],current_page:0,host_list:[],visible_host_list:[],current_host:"",tag_types:["","success","info","warning","danger"],query_tasks_args:{order_by:"last_change_time",sort:"desc",tag:""},callback_workers:{},custom_links:[],custom_tabs:[],current_cb_doc:"",init_iframe_rule_json:"",clicked_tab_names:{}}),methods:{add_new_task(){try{JSON.parse(this.new_task_form.result_list)}catch(e){this.$alert("Invalid JSON for result_list.");return}try{JSON.parse(this.new_task_form.request_args)}catch(s){this.$alert("Invalid JSON for request_args.");return}this.task_info_visible=!1;let t=JSON.stringify(this.new_task_form);this.$http.post("add_new_task",t).then(e=>{var s=e.body;"ok"==s.msg?(this.$message({message:"Update task "+this.new_task_form.name+" success: "+s.msg,type:"success"}),this.reload_tasks()):this.$message.error({message:"Update task "+this.new_task_form.name+" failed: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},init_iframe_crawler_rule(e){e?this.sub_app.new_rule_json=e:/httpbin\.org\/html/g.test(this.sub_app.new_rule_json)?this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"https://importpython.com/blog/feed/","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["xml","channel>item>title","$text"],["python","getitem","[0]"]],"child_rules":""},{"name":"url","chain_rules":[["xml","channel>item>link","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^https?://importpython.com/blog/feed/$","encoding":""}':this.sub_app.new_rule_json='{"name":"","request_args":{"method":"get","url":"http://httpbin.org/html","headers":{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}},"parse_rules":[{"name":"text","chain_rules":[["css","body h1","$text"],["python","getitem","[0]"]],"child_rules":""}],"regex":"^http://httpbin.org/html$","encoding":""}',this.sub_app.input_object="",this.sub_app.request_status="",this.sub_app.load_rule()},load_rule(e){this.sub_app.new_rule_json=e,this.sub_app.load_rule()},view_host_by_req(e){let s=JSON.parse(e).url;if(!s){this.$alert("request_args.url should not be null");return}document.getElementById("tab-rules").click(),setTimeout(()=>{this.current_host=new URL(s).hostname},0),this.task_info_visible=!1},view_crawler_rule_by_req(e){if(!e){this.$alert("request_args should not be null");return}this.$http.post("find_crawler_rule",e).then(e=>{var s=e.body;if("ok"==s.msg){let t=JSON.parse(s.result);this.view_crawler_rule(t),this.task_info_visible=!1}else this.$message.error({message:"rule not find in db: "+s.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},view_crawler_rule(e){this.rule_info_visible=!1,document.getElementById("tab-new").click(),this.uniparser_iframe_loaded?this.init_iframe_crawler_rule(JSON.stringify(e)):this.init_iframe_rule_json=JSON.stringify(e)},edit_crawler_rule(e){this.$prompt("","Edit Crawler JSON",{confirmButtonText:"OK",cancelButtonText:"Cancel",center:!0,inputType:"textarea",closeOnClickModal:!1,inputValue:JSON.stringify(e,null,2)}).then(({value:e})=>{this.process_crawler_rule("add",JSON.parse(e),0)}).catch(e=>{this.$message({type:"error",message:e})})},process_crawler_rule(e,s,t){let r=JSON.stringify(s||JSON.parse(this.sub_app.current_crawler_rule_json)),a="crawler_rule."+e;1==t&&(a+="?force=1"),this.$http.post(a,r).then(t=>{var r=t.body;"ok"==r.msg?(this.$message({message:e+" rule success",type:"success"}),"pop"==e&&r.result&&this.show_host_rule(this.current_host_rule.host)):"add"==e&&/matched more than 1 rule/g.test(r.msg)?this.$confirm("Failed for url matched more than 1 rule, overwrite it?","Confirm",{confirmButtonText:"Yes",cancelButtonText:"No",type:"error"}).then(()=>{this.process_crawler_rule(e,s,1)}).catch(()=>{this.$message({type:"info",message:"Adding rule canceled."})}):this.$message.error({message:e+" rule failed: "+r.msg,duration:0,showClose:!0})},e=>{this.$message.error({message:"connect failed: "+e.status,duration:0,showClose:!0})})},show_form_add_new_task(e){if(e){let s="";try{s=this.sub_app.crawler_rule.name}catch(t){console.log(t)}this.new_task_form={task_id:null,name:s,enable:1,tag:"default",error:"",request_args:"",origin_url:"",interval:300,work_hours:"0, 24",max_result_count:30,result_list:"[]",custom_info:""};let r=JSON.parse(this.sub_app.current_crawler_rule_json);this.new_task_form.request_args=JSON.stringify(r.request_args),this.new_task_form.origin_url=r.request_args.url||""}this.task_info_visible=!0},change_enable(e){this.$http.get("enable_task",{params:{task_id:e.task_id,enable:e.enable}}).then(e=>{var s=e.body;"ok"!=s.msg&&this.$message.error({message:"Update enable failed: "+s.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})},sort_change(e){this.query_tasks_args={order_by:e.column.label,sort:(e.column.order||"").replace("ending","")},this.reload_tasks()},reload_tasks(){this.task_list=[],this.current_page=0,this.load_tasks()},load_tasks(){let e=new 
URLSearchParams(window.location.search).get("tag");e?this.query_tasks_args.tag=e:this.query_tasks_args.tag="",current_page=this.current_page+1,this.query_tasks_args.page=current_page,this.$http.get("load_tasks",{params:this.query_tasks_args}).then(e=>{var s=e.body;"ok"==s.msg?(s.tasks.forEach(e=>{this.task_list.push(e)}),this.has_more=s.has_more,this.current_page=current_page):(this.$message.error({message:"Loading tasks failed: "+s.msg}),this.has_more=s.has_more)},e=>{this.$message.error({message:"connect failed: "+e.status})})},load_hosts(){this.$http.get("load_hosts",{params:{host:this.current_host}}).then(e=>{var s=e.body;this.current_host=s.host||"",this.host_list=s.hosts,this.visible_host_list=this.host_list},e=>{this.$message.error({message:"connect failed: "+e.status})})},init_iframe(){this.sub_app&&(this.init_iframe_crawler_rule(this.init_iframe_rule_json),this.init_iframe_rule_json&&(this.$message.success({message:"Rule loaded."}),this.init_iframe_rule_json=""),this.uniparser_iframe_loaded=!0)},handleClick(e){e.name in this.clicked_tab_names||(this.clicked_tab_names[e.name]=1,"rules"==e.name&&this.load_hosts())},escape_html:e=>e?e.replace(/[&<>'"]/g,e=>({"&":"&","<":"<",">":">","'":"'",'"':"""})[e]||e):"",show_time(e){var s='';JSON.parse(e.result_list||"[]"),s+='",s+='",s+='",s+="
last_check_time'+e.last_check_time.replace(/\..*/,"").replace("T"," ")+"
next_check_time'+e.next_check_time.replace(/\..*/,"").replace("T"," ")+"
last_change_time'+e.last_change_time.replace(/\..*/,"").replace("T"," ")+"
",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},get_latest_result(e,s=80){try{let t=JSON.parse(e);return t.title||t.text.slice(0,s)}catch(r){return e}},show_result_list(e){var s="";JSON.parse(e.result_list||"[]").forEach(e=>{if((result=e.result).url)var t='href="'+(result.url||"")+'"';else var t="";s+='"}),s+="
'+e.time+'"+this.escape_html(result.title||result.text)+"
",this.$alert(s,"Task result list: "+e.name,{confirmButtonText:"OK",center:!0,dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0})},force_crawl(e,s){this.$http.get("force_crawl",{params:{task_name:s.name}}).then(t=>{var r=t.body;if("ok"==r.msg){let a=r.task;Vue.set(this.task_list,e,a),a.error?this.$message.error({message:"Crawl task "+s.name+" "+a.error}):this.$message.success({message:"Crawl task "+s.name+" success"})}else this.$message.error({message:"Crawl task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"force_crawl connect failed: "+e.status})})},row_db_click(e){this.update_task(e)},show_task_error(e){app.$alert(e.error,"Crawler Error",{closeOnClickModal:!0,closeOnPressEscape:!0,center:!0})},update_task(e){this.new_task_form={task_id:e.task_id,name:e.name,enable:e.enable,tag:e.tag,request_args:e.request_args,origin_url:e.origin_url,interval:e.interval,work_hours:e.work_hours,max_result_count:e.max_result_count,result_list:e.result_list||"[]",custom_info:e.custom_info},this.show_form_add_new_task(!1)},delete_task(e,s){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_task",{params:{task_id:s.task_id}}).then(t=>{var r=t.body;"ok"==r.msg?(this.$message.success({message:"Delete task "+s.name+" success"}),this.task_list.splice(e,1)):this.$message.error({message:"Delete task "+s.name+" failed: "+r.msg})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},delete_host_rule(e){this.$confirm("Are you sure?","Confirm",{confirmButtonText:"Delete",cancelButtonText:"Cancel",type:"warning"}).then(()=>{this.$http.get("delete_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.$message.success({message:"Delete host "+e+" rule success"}),this.current_host_rule={},this.rule_info_visible=!1,this.load_hosts()):this.$message.error({message:"Delete host "+e+" rule failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}).catch(()=>{this.$message({type:"info",message:"Canceled"})})},show_host_rule(e){this.$http.get("get_host_rule",{params:{host:e}}).then(s=>{var t=s.body;"ok"==t.msg?(this.current_host_rule=t.host_rule,this.rule_info_visible=!0):this.$message.error({message:"get_host_rule "+e+" failed: "+JSON.stringify(t)})},e=>{this.$message.error({message:"connect failed: "+e.status})})},show_work_hours_doc(){let e=``;this.$alert(e,"work_hours format doc",{dangerouslyUseHTMLString:!0,closeOnClickModal:!0,closeOnPressEscape:!0,customClass:"work_hours_doc"})},check_error_task({row:e,rowIndex:s}){if(e.error)return"warning-row"},click_cb_name(e){this.current_cb_doc=this.callback_workers[e],this.new_task_form.custom_info=e+":"},update_frequency(){let e=this.current_host_rule.host,s=this.current_host_rule.n||0,t=this.current_host_rule.interval||0;this.$http.get("update_host_freq",{params:{host:e,n:s,interval:t}}).then(r=>{var a=r.body;"ok"==a.msg?(this.$message({message:"Update frequency "+e+": "+a.msg,type:"success"}),this.current_host_rule.n=s,this.current_host_rule.interval=t):this.$message.error({message:"update_frequency "+e+" failed: "+JSON.stringify(a)})},e=>{this.$message.error({message:"connect failed: "+e.status})})}},watch:{current_host:function(e){this.visible_host_list=[],/^https?:\/\//g.test(e)&&(e=new 
URL(e).hostname,this.current_host=e),this.host_list.forEach(s=>{s.name.includes(e)&&this.visible_host_list.push(s)})},task_info_visible:function(e){e||(this.current_cb_doc="")}},computed:{uni_iframe:()=>document.getElementById("uni_iframe"),sub_app(){let e=this.uni_iframe;if(e)return e.contentWindow.app}}},vue_app=Vue.extend(Main),app=new vue_app({delimiters:["${","}"]}).$mount("#app");(()=>{var e;let s=document.getElementById("init_vars"),t=JSON.parse(window.atob(s.innerHTML));Object.keys(t).forEach(e=>{app[e]=t[e]}),s.parentNode.removeChild(s),new IntersectionObserver(e=>{!(e[0].intersectionRatio<=0)&&app.has_more&&app.load_tasks()}).observe(document.getElementById("auto_load"))})(); 2 | -------------------------------------------------------------------------------- /watchdogs/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import ensure_future, get_event_loop 3 | from datetime import datetime 4 | from functools import lru_cache 5 | from json import dumps, loads 6 | from logging.handlers import RotatingFileHandler 7 | 8 | from frequency_controller import AsyncFrequency 9 | from uniparser.parsers import Uniparser 10 | 11 | from .config import Config, NotSet, ensure_dir, md5 12 | 13 | 14 | def get_valid_value(values: list, default=None, invalid=NotSet): 15 | for value in values: 16 | if value is not invalid: 17 | return value 18 | return default 19 | 20 | 21 | def get_file_handler(file_name, 22 | file_size_mb=2, 23 | backup_count=1, 24 | level=logging.INFO): 25 | handler = RotatingFileHandler( 26 | Config.CONFIG_DIR / file_name, 27 | maxBytes=1024 * 1024 * Config.LOGGING_FILE_CONFIG.get( 28 | file_name, {}).get('file_size_mb', file_size_mb), 29 | backupCount=Config.LOGGING_FILE_CONFIG.get(file_name, {}).get( 30 | 'backup_count', backup_count), 31 | encoding=Config.ENCODING) 32 | handler.setLevel( 33 | Config.LOGGING_FILE_CONFIG.get(file_name, {}).get('level', level)) 34 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER) 35 | return handler 36 | 37 | 38 | def get_stream_handler(level=logging.INFO): 39 | handler = logging.StreamHandler() 40 | handler.setLevel(level) 41 | handler.setFormatter(Config.DEFAULT_LOGGER_FORMATTER) 42 | return handler 43 | 44 | 45 | def setup_logger(): 46 | watchdogs_logger = logging.getLogger('watchdogs') 47 | uniparser_logger = logging.getLogger('uniparser') 48 | uvicorn_logger = logging.getLogger('uvicorn') 49 | if not Config.mute_file_log: 50 | info_handler = get_file_handler('info.log') 51 | watchdogs_logger.addHandler(info_handler) 52 | uniparser_logger.addHandler(info_handler) 53 | 54 | error_handler = get_file_handler('error.log') 55 | watchdogs_logger.addHandler(error_handler) 56 | uniparser_logger.addHandler(error_handler) 57 | 58 | server_handler = get_file_handler('server.log') 59 | uvicorn_logger.addHandler(server_handler) 60 | 61 | if not Config.mute_std_log: 62 | handler = get_stream_handler() 63 | watchdogs_logger.addHandler(handler) 64 | uniparser_logger.addHandler(handler) 65 | uvicorn_logger.addHandler(handler) 66 | return watchdogs_logger 67 | 68 | 69 | def setup_models(): 70 | from databases import Database 71 | 72 | # lazy import models to config cache size, means set cache after run main.init_app 73 | from .models import Metas, RuleStorageDB, create_tables 74 | 75 | Config.db = Database(Config.db_url) 76 | Config.rule_db = RuleStorageDB(Config.db) 77 | Config.metas = Metas(Config.db) 78 | # if Config.db_backup_function is None and Config.db_url.startswith( 
79 | # 'sqlite:///'): 80 | # Config.db_backup_function = default_db_backup_sqlite 81 | create_tables(str(Config.db.url)) 82 | 83 | 84 | async def setup_uniparser(): 85 | import base64 86 | import binascii 87 | import datetime 88 | import math 89 | import random 90 | import re 91 | 92 | import uniparser.fastapi_ui 93 | from torequests.utils import ( 94 | curlparse, 95 | escape, 96 | guess_interval, 97 | itertools_chain, 98 | json, 99 | parse_qs, 100 | parse_qsl, 101 | ptime, 102 | quote, 103 | quote_plus, 104 | slice_by_size, 105 | slice_into_pieces, 106 | split_n, 107 | timeago, 108 | ttime, 109 | unescape, 110 | unique, 111 | unquote, 112 | unquote_plus, 113 | urljoin, 114 | urlparse, 115 | urlsplit, 116 | urlunparse, 117 | ) 118 | from uniparser.config import GlobalConfig 119 | from uniparser.parsers import UDFParser 120 | from uniparser.utils import TorequestsAiohttpAsyncAdapter 121 | UDFParser._GLOBALS_ARGS.update({ 122 | 're': re, 123 | 'datetime': datetime, 124 | 'curlparse': curlparse, 125 | 'math': math, 126 | 'random': random, 127 | 'escape': escape, 128 | 'guess_interval': guess_interval, 129 | 'itertools_chain': itertools_chain, 130 | 'json': json, 131 | 'parse_qs': parse_qs, 132 | 'parse_qsl': parse_qsl, 133 | 'ptime': ptime, 134 | 'quote': quote, 135 | 'quote_plus': quote_plus, 136 | 'slice_by_size': slice_by_size, 137 | 'slice_into_pieces': slice_into_pieces, 138 | 'split_n': split_n, 139 | 'timeago': timeago, 140 | 'ttime': ttime, 141 | 'unescape': unescape, 142 | 'unique': unique, 143 | 'unquote': unquote, 144 | 'unquote_plus': unquote_plus, 145 | 'urljoin': urljoin, 146 | 'urlparse': urlparse, 147 | 'urlsplit': urlsplit, 148 | 'urlunparse': urlunparse, 149 | 'base64': base64, 150 | 'binascii': binascii, 151 | }) 152 | GlobalConfig.GLOBAL_TIMEOUT = Config.downloader_timeout 153 | Uniparser._DEFAULT_ASYNC_FREQUENCY = AsyncFrequency( 154 | *Config.DEFAULT_HOST_FREQUENCY) 155 | await load_host_freqs() 156 | Config.uniparser = Uniparser( 157 | request_adapter=TorequestsAiohttpAsyncAdapter()) 158 | uniparser.fastapi_ui.views.uni = Config.uniparser 159 | 160 | 161 | def setup_cdn_urls(use_default_cdn=False): 162 | from uniparser.fastapi_ui.views import cdn_urls 163 | 164 | if not Config.cdn_urls: 165 | # while cdn_urls not set, check use default cdn or static files. 
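        # For example (a sketch; the five keys must match those used below), a
        # deployer could pin custom mirrors before init_app runs:
        #   Config.cdn_urls = {'VUE_JS_CDN': 'https://my.mirror/vue.min.js', ...}
        # in which case this whole fallback branch is skipped.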
166 | if use_default_cdn: 167 | # default online cdn 168 | Config.cdn_urls = { 169 | 'VUE_JS_CDN': 'https://cdn.staticfile.org/vue/2.6.11/vue.min.js', 170 | 'ELEMENT_CSS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/theme-chalk/index.css', 171 | 'ELEMENT_JS_CDN': 'https://cdn.staticfile.org/element-ui/2.13.0/index.js', 172 | 'VUE_RESOURCE_CDN': 'https://cdn.staticfile.org/vue-resource/1.5.1/vue-resource.min.js', 173 | 'CLIPBOARDJS_CDN': 'https://cdn.staticfile.org/clipboard.js/2.0.4/clipboard.min.js', 174 | } 175 | else: 176 | # local statics 177 | Config.cdn_urls = { 178 | 'VUE_JS_CDN': '/static/js/vue.min.js', 179 | 'ELEMENT_CSS_CDN': '/static/css/index.css', 180 | 'ELEMENT_JS_CDN': '/static/js/index.js', 181 | 'VUE_RESOURCE_CDN': '/static/js/vue-resource.min.js', 182 | 'CLIPBOARDJS_CDN': '/static/js/clipboard.min.js', 183 | } 184 | # overwrite uniparser's cdn 185 | cdn_urls.update(Config.cdn_urls) 186 | 187 | 188 | def setup_lru_cache(): 189 | Config._md5 = lru_cache(maxsize=Config.md5_cache_maxsize)(Config._md5) 190 | Config.get_sign = lru_cache(maxsize=Config.sign_cache_maxsize)( 191 | Config.get_sign) 192 | 193 | 194 | def setup(use_default_cdn=False): 195 | setup_logger() 196 | setup_lru_cache() 197 | setup_cdn_urls(use_default_cdn=use_default_cdn) 198 | setup_models() 199 | 200 | 201 | async def setup_md5_salt(): 202 | logger = Config.logger 203 | exist_salt = await Config.metas.get('md5_salt', None) 204 | if not Config.md5_salt: 205 | if exist_salt: 206 | # no need to update 207 | Config.md5_salt = exist_salt 208 | return 209 | else: 210 | # create new salt 211 | from uuid import uuid1 212 | Config.md5_salt = uuid1().hex 213 | elif Config.md5_salt == exist_salt: 214 | # no need to update 215 | return 216 | # need to update: new md5_salt from settings, or no exist_salt 217 | logger.critical(f'Setting md5_salt as {Config.md5_salt}, replaced into db.') 218 | return await Config.metas.set('md5_salt', Config.md5_salt) 219 | 220 | 221 | async def setup_crawler(): 222 | from uniparser import Crawler 223 | 224 | from .callbacks import CallbackHandler 225 | 226 | crawler = Crawler(uniparser=Config.uniparser, storage=Config.rule_db) 227 | Config.crawler = crawler 228 | if Config.callback_handler is None: 229 | Config.callback_handler = CallbackHandler() 230 | workers = ', '.join(Config.callback_handler.callbacks_dict.keys()) 231 | Config.logger.info(f'Current online callbacks: {workers}') 232 | 233 | 234 | async def update_password(password=None): 235 | if password is not None: 236 | Config.password = password 237 | return await Config.metas.set('admin', Config.password) 238 | 239 | 240 | async def refresh_token(): 241 | if Config.password: 242 | await update_password() 243 | password = Config.password 244 | else: 245 | password = await Config.metas.get('admin', '') 246 | if password: 247 | Config.watchdog_auth = md5(password) 248 | 249 | 250 | async def setup_background(): 251 | from .background import background_loop, db_backup_handler 252 | from .crawler import crawl_once 253 | Config.background_funcs.append(crawl_once) 254 | if Config.db_backup_function: 255 | Config.background_funcs.append(db_backup_handler) 256 | Config.background_task = ensure_future( 257 | background_loop(Config.background_funcs)) 258 | 259 | 260 | def setup_exception_handlers(app): 261 | for exc, callback in Config.exception_handlers: 262 | app.add_exception_handler(exc, callback) 263 | 264 | 265 | def mute_noise_logger(): 266 | # uvicorn will set new handler for root logger and access logger after app 
launched. 267 | logging.getLogger('').handlers.clear() 268 | if Config.uvicorn_kwargs['access_log']: 269 | # fix https://github.com/encode/uvicorn/issues/523 270 | access_logger = logging.getLogger('uvicorn.access') 271 | access_logger.propagate = True 272 | access_logger.handlers.clear() 273 | 274 | 275 | async def setup_app(app): 276 | mute_noise_logger() 277 | db = Config.db 278 | if not db: 279 | raise RuntimeError('No database?') 280 | await db.connect() 281 | await setup_md5_salt() 282 | # refresh_token should be after setup_md5_salt 283 | await refresh_token() 284 | setup_exception_handlers(app) 285 | # 1 286 | await setup_uniparser() 287 | # 2 288 | await setup_crawler() 289 | # 3 290 | await setup_background() 291 | from . import __version__ 292 | Config.logger.info( 293 | f'App started, the current version is {__version__}, CONFIG_DIR: {Config.CONFIG_DIR}' 294 | ) 295 | 296 | 297 | async def release_app(app): 298 | Config.is_shutdown = True 299 | if Config.background_task and not Config.background_task.done(): 300 | Config.background_task.cancel() 301 | if Config.db: 302 | await Config.db.disconnect() 303 | 304 | 305 | async def default_db_backup_sqlite(): 306 | current_time = datetime.now().strftime('%Y%m%d%H%M%S') 307 | for storage_path in Config.CONFIG_DIR.iterdir(): 308 | if storage_path.name == 'storage.sqlite': 309 | import shutil 310 | from pathlib import Path 311 | backup_dir: Path = ensure_dir(Config.CONFIG_DIR / 'backups') 312 | backup_path = backup_dir / f'storage-{current_time}.sqlite' 313 | # 3.6 has no get_running_loop 314 | loop = get_event_loop() 315 | # wait for copy 316 | future = loop.run_in_executor(None, shutil.copy, str(storage_path), 317 | str(backup_path)) 318 | await future 319 | # remove overdue files 320 | backup_file_paths = sorted([i for i in backup_dir.iterdir()], 321 | key=lambda path: path.name, 322 | reverse=True) 323 | path_to_del = backup_file_paths[Config.db_backup_count:] 324 | for p in path_to_del: 325 | p.unlink() 326 | 327 | 328 | def get_host_freq_list(host): 329 | freq = Uniparser._HOST_FREQUENCIES.get(host, None) 330 | if freq: 331 | return [freq.n, freq.interval] 332 | else: 333 | return [None, 0] 334 | 335 | 336 | async def set_host_freq(host, n, interval): 337 | if n: 338 | Uniparser._HOST_FREQUENCIES[host] = AsyncFrequency(n, interval) 339 | else: 340 | Uniparser._HOST_FREQUENCIES.pop(host, None) 341 | await save_host_freqs() 342 | 343 | 344 | async def save_host_freqs(): 345 | items = { 346 | host: freq.to_list() 347 | for host, freq in Uniparser._HOST_FREQUENCIES.items() 348 | } 349 | await Config.metas.set('host_freqs', dumps(items)) 350 | 351 | 352 | async def load_host_freqs(): 353 | host_freqs_str = await Config.metas.get('host_freqs', default='{}') 354 | host_freqs = loads(host_freqs_str) 355 | Uniparser._HOST_FREQUENCIES = { 356 | host: AsyncFrequency(*args) for host, args in host_freqs.items() 357 | } 358 | -------------------------------------------------------------------------------- /watchdogs/static/js/vue-resource.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * vue-resource v1.5.1 3 | * https://github.com/pagekit/vue-resource 4 | * Released under the MIT License. 
5 | */
6 | 
7 | !function(t,e){"object"==typeof exports&&"undefined"!=typeof module?module.exports=e():"function"==typeof define&&define.amd?define(e):t.VueResource=e()}(this,function(){"use strict";function u(t){this.state=2,this.value=void 0,this.deferred=[];var e=this;try{t(function(t){e.resolve(t)},function(t){e.reject(t)})}catch(t){e.reject(t)}}u.reject=function(n){return new u(function(t,e){e(n)})},u.resolve=function(n){return new u(function(t,e){t(n)})},u.all=function(s){return new u(function(n,t){var o=0,r=[];function e(e){return function(t){r[e]=t,(o+=1)===s.length&&n(r)}}0===s.length&&n(r);for(var i=0;i
--------------------------------------------------------------------------------
/watchdogs/crawler.py:
--------------------------------------------------------------------------------
47 | def find_next_check_time(task: Task, now: Optional[datetime] = None) -> Tuple[bool, datetime]:
48 |     '''
49 |     Five supported formats:
50 | 
51 |     1. Two numbers separated by ', ', as work_hours:
52 |         0, 24 means from 00:00 ~ 23:59, for every day
53 |     2. JSON list of int, as work_hours:
54 |         [1, 19] means 01:00~01:59 a.m. and 07:00~07:59 p.m., for every day
55 |     3. Standard strftime format, as work_days:
56 |         > Split work_hours by '==', then check
57 |           whether datetime.now().strftime(wh[0]) == wh[1]
58 |         %A==Friday means every Friday
59 |         %m-%d==03-13 means every year on 03-13
60 |         %H==05 means every morning 05:00 ~ 05:59
61 |     4. Mix work_days and work_hours:
62 |         > Join work_days and work_hours with ';'/'&' => 'and', '|' => 'or'.
63 |         > Supports == for equal, != for unequal.
64 |         %w==5;20, 24 means every Friday 20:00 ~ 23:59
65 |         [1, 2, 15];%w==5 means every Friday at 1 a.m., 2 a.m. and 3 p.m. (the work_hours part is on the left side)
66 |         %w==5|20, 24 means every Friday, or any day 20:00 ~ 23:59
67 |         %w==5|%w==2 means every Friday or Tuesday
68 |         %w!=6&%w!=0 means every day except Saturday & Sunday
69 |     5. Set an ensured change interval:
70 |         > If the work_hours string ends with `#` plus `x` seconds, next_change_time is checked first.
71 |         > In other words: the interval between two changes is known to be more than `x` seconds,
72 |         > so this task's crawler will not run again before `last_change_time + change_interval`.
73 |         %w==5#86400 means every Friday, if it did not change within 1 day
74 |         0, 24#3600 means every hour, if it did not change within this hour. After a change, the task is crawled at most once per interval.
75 |     '''
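    # Worked examples for the formats above (illustrative; they assume that
    # check_work_time from .utils evaluates work_hours exactly as documented):
    #   check_work_time('0, 24', datetime(2020, 3, 13, 5))        -> True   (always on)
    #   check_work_time('[1, 19]', datetime(2020, 3, 13, 5))      -> False  (only 1 a.m. and 7 p.m.)
    #   check_work_time('%w==5', datetime(2020, 3, 13, 5))        -> True   (2020-03-13 is a Friday)
    #   check_work_time('%w==5;20, 24', datetime(2020, 3, 13, 5)) -> False  (Friday, but not 20:00~23:59)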
76 |     # find the latest hour that fits work_hours; if none exists, check from the next day 00:00
77 |     now = now or datetime.now()
78 |     work_hours = task.work_hours or '0, 24'
79 |     if '#' in work_hours:
80 |         # check whether the result changed recently
81 |         last_change_time = task.last_change_time or datetime.fromtimestamp(0)
82 |         # split work_hours and change_interval
83 |         work_hours, change_interval_str = work_hours.split('#')
84 |         change_interval = int(change_interval_str)
85 |         # still inside the change interval: wait for the remaining seconds
86 |         next_change_time = last_change_time + timedelta(seconds=change_interval)
87 |         if now < next_change_time:
88 |             Config.logger.info(
89 |                 f'Task [{task.name}] changed less than {timeago(change_interval, accuracy=1, format=1, short_name=1)} ago.'
90 |             )
91 |             return False, next_change_time
92 | 
93 |     need_crawl = check_work_time(work_hours, now)
94 |     if need_crawl:
95 |         # inside work time: next_check_time is now + interval
96 |         next_check_time = now + timedelta(seconds=task.interval)
97 |         return need_crawl, next_check_time
98 |     else:
99 |         # outside work time
100 |         next_check_time = now
101 |         # fast-forward next_check_time instead of re-checking once per crawl loop
102 |         for _ in range(60):
103 |             # next interval
104 |             next_check_time = next_check_time + timedelta(seconds=task.interval)
105 |             _need_crawl = check_work_time(work_hours, next_check_time)
106 |             if _need_crawl:
107 |                 # `now` is still off-hours, but next_check_time lands inside work time
108 |                 break
109 |         return need_crawl, next_check_time
110 | 
111 | 
112 | async def crawl(task: Task):
113 |     crawler: Crawler = Config.crawler
114 |     logger = Config.logger
115 |     logger.info(f'Start crawling: {task.name}')
116 |     crawl_result = await try_catch(crawler.acrawl, task.request_args)
117 |     error = ''
118 |     if isinstance(crawl_result, RuleNotFoundError):
119 |         error = repr(crawl_result)
120 |         logger.error(f'{task.name}: {error}')
121 |         result_list = [{"text": error}]
122 |     elif isinstance(crawl_result, BaseException):
123 |         error = getattr(crawl_result, 'text', repr(crawl_result))
124 |         logger.error(f'{task.name}: {error}')
125 |         result_list = None
126 |     else:
127 |         if len(crawl_result) == 1:
128 |             # crawl_result schema: {rule_name: list_or_dict}
129 |             formatted_result = get_watchdog_result(
130 |                 item=crawl_result.popitem()[1])
131 |             if formatted_result == {'text': 'text not found'}:
132 |                 error = f'{task.name} text not found, crawl result given: {crawl_result}'
133 |                 logger.error(error)
134 |                 result_list = None
135 |             else:
136 |                 if isinstance(formatted_result, list):
137 |                     result_list = formatted_result
138 |                 else:
139 |                     result_list = [formatted_result]
140 |                 # use Force Crawl on the web UI for more logs
141 |                 logger.info(f'{task.name} Crawl success: {result_list}'[:150])
142 |         else:
143 |             error = 'Invalid crawl_result against schema {rule_name: [{"text": "Required", "url": "Optional", "key": "Optional", "unique": "Optional"}]}, given is %r' % crawl_result
144 |             logger.error(f'{task.name}: {error}')
145 |             result_list = [{"text": error}]
146 |     return task, error, result_list
147 | 
148 | 
149 | async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
150 |     """a non-null task_name means a forced crawl of that single task"""
151 |     db: Database = Config.db
152 |     now = datetime.now()
153 |     logger = Config.logger
154 |     logger.info(f'crawl_once task_name={task_name} start.')
155 |     # sqlite has no datediff, so compare a precomputed next_check_time instead
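    # For reference, the enabled-tasks branch below compiles to roughly this SQL
    # (a sketch; the exact text depends on the SQLAlchemy dialect):
    #   SELECT * FROM tasks WHERE enable = 1 AND next_check_time <= :now LIMIT 20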
156 |     if task_name:
157 |         query = tasks.select().where(tasks.c.name == task_name)
158 |     else:
159 |         query = tasks.select().where(tasks.c.enable == 1).where(
160 |             tasks.c.next_check_time <= now)
161 |     query = query.limit(chunk_size)
162 |     todo = []
163 |     now = datetime.now()
164 |     update_values = []
165 |     CLEAR_CACHE_NEEDED = False
166 |     fetched_tasks = await db.fetch_all(query=query)
167 |     has_more = len(fetched_tasks) >= chunk_size
168 |     for _task in fetched_tasks:
169 |         task = Task(**dict(_task))
170 |         # check work hours
171 |         need_crawl, next_check_time = find_next_check_time(task, now)
172 |         if task_name:
173 |             # always crawl for a given task_name
174 |             need_crawl = True
175 |         if need_crawl:
176 |             t = ensure_future(crawl(task))
177 |             # attach the name (for logs) and id (for timeout errors below)
178 |             t.task_name, t.task_id = task.name, task.task_id
179 |             todo.append(t)
180 |         # update next_check_time
181 |         values = {
182 |             'last_check_time': now,
183 |             'next_check_time': next_check_time,
184 |             'task_id': task.task_id
185 |         }
186 |         # update the task variable for callbacks
187 |         task.__dict__.update(values)
188 |         update_values.append(values)
189 |         if not need_crawl:
190 |             logger.info(
191 |                 f'Task [{task.name}] is not on work time, next_check_time reset to {next_check_time}'
192 |             )
193 |     update_query = 'update tasks set `last_check_time`=:last_check_time,`next_check_time`=:next_check_time where task_id=:task_id'
194 |     await db.execute_many(query=update_query, values=update_values)
195 |     if update_values:
196 |         CLEAR_CACHE_NEEDED = True
197 |     logger.info(f'crawl_once crawling {len(todo)} valid tasks.')
198 |     if todo:
199 |         crawl_errors = []
200 |         done, pending = await wait(todo, timeout=Config.default_crawler_timeout)
201 |         if pending:
202 |             names = [getattr(t, 'task_name', None) for t in pending]
203 |             logger.error(f'crawl timeout {len(names)}: {names}')
204 |             for _pending in pending:
205 |                 crawl_errors.append({
206 |                     'task_id': _pending.task_id,  # attached above; a bare `task` here would be a stale loop variable
207 |                     'error': 'timeout(%s)' % Config.default_crawler_timeout
208 |                 })
209 |                 _pending.cancel()
210 |         ttime_now = ttime()
211 |         changed_tasks = []
212 |         update_counts = 0
213 |         for t in done:
214 |             task, error, result_list = t.result()
215 |             if error != task.error:
216 |                 crawl_errors.append({'task_id': task.task_id, 'error': error})
217 |             if error or result_list is None:
218 |                 # skip updating this task
219 |                 continue
220 |             # compare latest_result with the new list
221 |             # newest first, matching the saved result_list ordering
222 |             old_latest_result = loads(task.latest_result)
223 |             # try to use the explicit key, else the result itself
224 |             try:
225 |                 old_result_list = loads(
226 |                     task.result_list) if task.result_list else []
227 |             except JSONDecodeError:
228 |                 old_result_list = []
229 |             if old_latest_result.get('unique', True):
230 |                 # unique mode skips all duplicated results
231 |                 exist_keys = {
232 |                     get_result_key(_old_result['result'])
233 |                     for _old_result in old_result_list
234 |                 }
235 |             else:
236 |                 old_latest_result_key = get_result_key(old_latest_result)
237 |                 exist_keys = {old_latest_result_key}
238 |             # list of dict
239 |             to_insert_result_list = []
240 |             for result in result_list:
241 |                 result_key = get_result_key(result)
242 |                 if result_key in exist_keys:
243 |                     break
244 |                 to_insert_result_list.append(result)
245 |             if to_insert_result_list:
246 |                 # update db
247 |                 update_counts += 1
248 |                 # new result found
249 |                 query = UpdateTaskQuery(task.task_id)
250 |                 # JSON
251 |                 new_latest_result = dumps(to_insert_result_list[0],
252 |                                           sort_keys=True)
253 |                 query.add('latest_result', new_latest_result)
254 |
query.add('last_change_time', now) 255 | # older insert first, keep the newer is on the top 256 | new_seeds = [] 257 | for result in to_insert_result_list[::-1]: 258 | # result is dict, not json string 259 | old_result_list.insert(0, { 260 | 'result': result, 261 | 'time': ttime_now 262 | }) 263 | new_seeds.append(result) 264 | await save_feed(new_seeds, db, task) 265 | new_result_list = dumps(old_result_list[:task.max_result_count]) 266 | query.add('result_list', new_result_list) 267 | logger.info(f'[Updated] {task.name}. +++') 268 | await db.execute(**query.kwargs) 269 | task.latest_result = new_latest_result 270 | task.last_change_time = now 271 | task.result_list = new_result_list 272 | changed_tasks.append(task) 273 | if crawl_errors: 274 | update_query = 'update tasks set `error`=:error where task_id=:task_id' 275 | await db.execute_many(query=update_query, values=crawl_errors) 276 | logger.info( 277 | f'Crawl task_name={task_name} finished. Crawled: {len(done)}, Error: {len(crawl_errors)}, Timeout: {len(pending)}, Update: {update_counts}.{" +++" if update_counts else ""}' 278 | ) 279 | for task in changed_tasks: 280 | ensure_future(try_catch(Config.callback_handler.callback, task)) 281 | query_feeds.cache_clear() 282 | else: 283 | logger.info(f'Crawl task_name={task_name} finished. 0 todo.') 284 | if CLEAR_CACHE_NEEDED: 285 | logger.info('Clear cache for crawling new tasks.') 286 | query_tasks.cache_clear() 287 | if task_name: 288 | query = tasks.select().where(tasks.c.name == task_name) 289 | _task = await db.fetch_one(query=query) 290 | return dict(_task) 291 | else: 292 | return has_more 293 | 294 | 295 | async def crawl_once(task_name: Optional[str] = None): 296 | if task_name is not None: 297 | return await _crawl_once(task_name) 298 | with solo: 299 | result = await try_catch(_crawl_once, task_name) 300 | return result 301 | 302 | 303 | async def save_feed(new_seeds, db, task): 304 | if not new_seeds: 305 | return 306 | try: 307 | values = [] 308 | for result in new_seeds: 309 | value = { 310 | 'task_id': task.task_id, 311 | 'name': task.name, 312 | 'text': result.get('title') or result.get('text') or '', 313 | 'url': result.get('url') or task.origin_url, 314 | 'ts_create': datetime.now(), 315 | } 316 | values.append(value) 317 | 318 | query = "INSERT INTO feeds (`task_id`, `name`, `text`, `url`, `ts_create`) values (:task_id, :name, :text, :url, :ts_create)" 319 | result = await db.execute_many(query=query, values=values) 320 | Config.logger.info( 321 | f'Insert task seeds success({task.name}): ({len(values)})') 322 | return result 323 | except Exception: 324 | Config.logger.error( 325 | f'Inserting task seeds failed({task.name}): {format_exc()}') 326 | -------------------------------------------------------------------------------- /watchdogs/models.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from traceback import format_exc 4 | from typing import Iterable, List, Optional, Set, Tuple, Union 5 | 6 | import sqlalchemy 7 | from async_lru import alru_cache 8 | from databases import Database 9 | from pydantic import BaseModel 10 | from sqlalchemy.sql import text 11 | from uniparser import CrawlerRule, HostRule 12 | from uniparser.crawler import RuleStorage, get_host 13 | 14 | from .config import Config 15 | 16 | if Config.COLLATION is None: 17 | if Config.db_url.startswith('sqlite'): 18 | Config.COLLATION = None 19 | else: 20 | Config.COLLATION = 'utf8_unicode_ci' 21 | 22 | metadata = 
sqlalchemy.MetaData() 23 | date0 = datetime.strptime('1970-01-01 08:00:00', '%Y-%m-%d %H:%M:%S') 24 | # server_default works instead of default, issue: https://github.com/encode/databases/issues/72 25 | tasks = sqlalchemy.Table( 26 | "tasks", 27 | metadata, 28 | sqlalchemy.Column('task_id', 29 | sqlalchemy.Integer, 30 | primary_key=True, 31 | autoincrement=True), 32 | sqlalchemy.Column("name", 33 | sqlalchemy.String(64, collation=Config.COLLATION), 34 | nullable=False, 35 | index=True, 36 | unique=True), 37 | sqlalchemy.Column("enable", 38 | sqlalchemy.Integer, 39 | server_default=text('1'), 40 | nullable=False), 41 | sqlalchemy.Column("tag", 42 | sqlalchemy.String(128, collation=Config.COLLATION), 43 | server_default="default", 44 | nullable=False), 45 | sqlalchemy.Column("error", sqlalchemy.TEXT(collation=Config.COLLATION)), 46 | sqlalchemy.Column("request_args", 47 | sqlalchemy.TEXT(collation=Config.COLLATION), 48 | nullable=False), 49 | sqlalchemy.Column("origin_url", 50 | sqlalchemy.String(1024), 51 | nullable=False, 52 | server_default=""), 53 | sqlalchemy.Column("interval", 54 | sqlalchemy.Integer, 55 | server_default=text('300'), 56 | nullable=False), 57 | sqlalchemy.Column("work_hours", 58 | sqlalchemy.String(32), 59 | server_default='0, 24', 60 | nullable=False), 61 | sqlalchemy.Column("max_result_count", 62 | sqlalchemy.Integer, 63 | server_default=text('10'), 64 | nullable=False), 65 | sqlalchemy.Column("latest_result", sqlalchemy.TEXT), 66 | sqlalchemy.Column("result_list", sqlalchemy.TEXT), # JSON list 67 | sqlalchemy.Column("last_check_time", 68 | sqlalchemy.TIMESTAMP, 69 | server_default="1970-01-01 08:00:00", 70 | nullable=False), 71 | sqlalchemy.Column("next_check_time", 72 | sqlalchemy.TIMESTAMP, 73 | server_default="1970-01-01 08:00:00", 74 | nullable=False), 75 | sqlalchemy.Column("last_change_time", 76 | sqlalchemy.TIMESTAMP, 77 | server_default="1970-01-01 08:00:00", 78 | index=True, 79 | nullable=False), 80 | sqlalchemy.Column("custom_info", 81 | sqlalchemy.TEXT(collation=Config.COLLATION)), 82 | ) 83 | host_rules = sqlalchemy.Table( 84 | "host_rules", 85 | metadata, 86 | sqlalchemy.Column('host', sqlalchemy.String(128), primary_key=True), 87 | sqlalchemy.Column('host_rule', sqlalchemy.TEXT), 88 | ) 89 | metas = sqlalchemy.Table( 90 | "metas", 91 | metadata, 92 | sqlalchemy.Column('key', 93 | sqlalchemy.String(64, collation=Config.COLLATION), 94 | primary_key=True), 95 | sqlalchemy.Column('value', sqlalchemy.TEXT(collation=Config.COLLATION)), 96 | ) 97 | feeds = sqlalchemy.Table( 98 | "feeds", 99 | metadata, 100 | sqlalchemy.Column('id', 101 | sqlalchemy.Integer, 102 | primary_key=True, 103 | autoincrement=True), 104 | sqlalchemy.Column('task_id', sqlalchemy.Integer, nullable=False), 105 | sqlalchemy.Column("name", 106 | sqlalchemy.String(64, collation=Config.COLLATION), 107 | nullable=False), 108 | # sqlalchemy.Column("tag", 109 | # sqlalchemy.String(128, collation=Config.COLLATION), 110 | # server_default="default", 111 | # nullable=False), 112 | sqlalchemy.Column("text", sqlalchemy.TEXT), 113 | sqlalchemy.Column("url", 114 | sqlalchemy.String(1024), 115 | nullable=False, 116 | server_default=""), 117 | sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False), 118 | ) 119 | groups = sqlalchemy.Table( 120 | "groups", 121 | metadata, 122 | sqlalchemy.Column('id', 123 | sqlalchemy.Integer, 124 | primary_key=True, 125 | autoincrement=True), 126 | sqlalchemy.Column("name", 127 | sqlalchemy.String(64, collation=Config.COLLATION), 128 | nullable=False), 129 | 
sqlalchemy.Column("task_ids", sqlalchemy.TEXT), 130 | # sqlalchemy.Column("ts_create", sqlalchemy.TIMESTAMP, nullable=False), 131 | ) 132 | 133 | 134 | def create_tables(db_url): 135 | try: 136 | engine = sqlalchemy.create_engine(db_url) 137 | metadata.create_all(engine) 138 | except BaseException: 139 | Config.logger.critical(f'Fatal error on creating Table: {format_exc()}') 140 | import os 141 | os._exit(1) 142 | 143 | 144 | class RuleStorageDB(RuleStorage): 145 | 146 | def __init__(self, db): 147 | self.db = db 148 | self.logger = Config.logger 149 | 150 | async def commit(self): 151 | pass 152 | 153 | async def get_host_rule(self, host: str, default=None): 154 | query = "SELECT host_rule FROM host_rules WHERE host = :host" 155 | host_rule = await self.db.fetch_one(query=query, values={"host": host}) 156 | if host_rule: 157 | return HostRule.loads(host_rule[0]) 158 | else: 159 | return default 160 | 161 | async def find_crawler_rule(self, url, method='find') -> CrawlerRule: 162 | if not url: 163 | return None 164 | host = get_host(url) 165 | host_rule = await self.get_host_rule(host) 166 | if host_rule: 167 | return host_rule.find(url) 168 | 169 | async def add_crawler_rule(self, rule: CrawlerRule, commit=None): 170 | if isinstance(rule, str): 171 | rule = CrawlerRule.loads(rule) 172 | elif isinstance(rule, dict) and not isinstance(rule, CrawlerRule): 173 | rule = CrawlerRule(**rule) 174 | if not rule.get('regex'): 175 | raise ValueError('regex should not be null') 176 | url = rule.get('request_args', {}).get('url') 177 | if not url: 178 | self.logger.error(f'[Rule] {rule["name"]} not found url.') 179 | return False 180 | host = get_host(url) 181 | if not host: 182 | return False 183 | exist_host_rule = await self.get_host_rule(host) 184 | if exist_host_rule: 185 | exist_host_rule.add_crawler_rule(rule) 186 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host" 187 | return await self.db.execute( 188 | query=query, 189 | values={ 190 | 'host_rule_string': exist_host_rule.dumps(), 191 | 'host': host 192 | }) 193 | else: 194 | host_rule = HostRule(host) 195 | host_rule.add_crawler_rule(rule) 196 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)" 197 | return await self.db.execute( 198 | query=query, 199 | values={ 200 | 'host_rule_string': host_rule.dumps(), 201 | 'host': host 202 | }) 203 | 204 | async def pop_crawler_rule(self, rule: CrawlerRule, commit=False): 205 | query = "SELECT host_rule FROM host_rules" 206 | host = get_host(rule['request_args'].get('url')) 207 | values = {} 208 | if host: 209 | query += ' WHERE host = :host' 210 | values['host'] = host 211 | rows = await self.db.fetch_all(query=query, values=values) 212 | for row in rows: 213 | host_rule = HostRule.loads(row.host_rule) 214 | crawler_rule = host_rule.pop_crawler_rule(rule['name']) 215 | if crawler_rule: 216 | # update host_rule 217 | await self.add_host_rule(host_rule) 218 | return crawler_rule 219 | 220 | async def add_host_rule(self, rule: HostRule, commit=None): 221 | """insert or update HostRule""" 222 | # some sql not support upsert: insert replace, replace into, on conflict 223 | query = "SELECT host_rule FROM host_rules WHERE host = :host" 224 | exist_host_rule = await self.get_host_rule(rule['host']) 225 | if exist_host_rule: 226 | query = "update host_rules set host_rule=:host_rule_string WHERE host = :host" 227 | return await self.db.execute(query=query, 228 | values={ 229 | 'host_rule_string': rule.dumps(), 230 | 'host': rule['host'] 231 | }) 
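        # (Sketch) Backends with native upsert could collapse the two branches
        # into one statement, e.g. SQLite / PostgreSQL:
        #   INSERT INTO host_rules (host, host_rule) VALUES (:host, :host_rule_string)
        #   ON CONFLICT(host) DO UPDATE SET host_rule = excluded.host_rule
        # The portable select-then-write here avoids such dialect differences.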
232 | else: 233 | query = "INSERT INTO host_rules (host, host_rule) values (:host, :host_rule_string)" 234 | return await self.db.execute(query=query, 235 | values={ 236 | 'host_rule_string': rule.dumps(), 237 | 'host': rule['host'] 238 | }) 239 | 240 | async def pop_host_rule(self, host: str, commit=None): 241 | exist_host_rule = await self.get_host_rule(host) 242 | host_rule = HostRule.loads(exist_host_rule) if exist_host_rule else None 243 | if host_rule: 244 | query = "delete FROM host_rules WHERE host = :host" 245 | await self.db.execute(query=query, values={'host': host}) 246 | return host_rule 247 | 248 | 249 | class Task(BaseModel): 250 | task_id: Optional[int] = None 251 | name: str 252 | enable: int = 0 253 | tag: str = 'default' 254 | error: str = '' 255 | request_args: str 256 | origin_url: str = '' 257 | interval: int = 300 258 | work_hours: str = '0, 24' 259 | max_result_count: int = 30 260 | latest_result: str = '{}' 261 | result_list = '[]' 262 | last_check_time: datetime = date0 263 | next_check_time: datetime = date0 264 | last_change_time: datetime = date0 265 | custom_info: str = '' 266 | 267 | 268 | class Group(BaseModel): 269 | id: Optional[int] = None 270 | name: str = '' 271 | task_ids: str = '' 272 | 273 | 274 | class Feed(BaseModel): 275 | task_id: int 276 | name: str 277 | text: str 278 | url: str 279 | ts_create: datetime 280 | 281 | 282 | class Metas(object): 283 | """Save & Load some variables with db""" 284 | 285 | def __init__(self, db: Database): 286 | self.db = db 287 | 288 | async def set(self, key, value): 289 | query = 'replace into metas (`key`, `value`) values (:key, :value)' 290 | await Config.db.execute(query, values={'key': key, 'value': value}) 291 | self.clear_cache() 292 | if (await self.get(key)) == value: 293 | return True 294 | else: 295 | return False 296 | 297 | async def remove(self, key): 298 | query = 'delete from metas where `key`=:key' 299 | await Config.db.execute(query, values={'key': key}) 300 | self.clear_cache() 301 | if not (await self.get(key)): 302 | return True 303 | else: 304 | return False 305 | 306 | @alru_cache(maxsize=Config.metas_cache_maxsize) 307 | async def _get(self, key, default=None): 308 | query = 'select `value` from metas where `key`=:key' 309 | result = await self.db.fetch_one(query, values={'key': key}) 310 | if result: 311 | return result.value 312 | else: 313 | return default 314 | 315 | async def get(self, key, default=None, cache=True): 316 | if not cache: 317 | self.clear_cache() 318 | return await self._get(key, default=default) 319 | 320 | def clear_cache(self): 321 | self._get.cache_clear() 322 | 323 | 324 | @alru_cache(maxsize=Config.query_tasks_cache_maxsize) 325 | async def query_tasks( 326 | task_name: Optional[str] = None, 327 | task_id: Optional[int] = None, 328 | page: int = 1, 329 | page_size: int = Config.default_page_size, 330 | order_by: str = 'last_change_time', 331 | sort: str = 'desc', 332 | tag: str = '', 333 | task_ids: Tuple[int] = None, 334 | ) -> Tuple[List[dict], bool]: 335 | # task_ids arg type is tuple for cache hashing 336 | offset = page_size * (page - 1) 337 | query = tasks.select() 338 | if task_ids: 339 | query = query.where(tasks.c.task_id.in_(task_ids)) 340 | else: 341 | if task_id is not None: 342 | query = query.where(tasks.c.task_id == task_id) 343 | if task_name is not None: 344 | query = query.where(tasks.c.name == task_name) 345 | if tag: 346 | query = query.where(tasks.c.tag == tag) 347 | if order_by and sort: 348 | ob = getattr(tasks.c, order_by, None) 349 | if ob 
is None:
350 |             raise ValueError(f'bad order_by {order_by}')
351 |         if sort.lower() == 'desc':
352 |             ob = sqlalchemy.desc(ob)
353 |         elif sort.lower() == 'asc':
354 |             ob = sqlalchemy.asc(ob)
355 |         else:
356 |             raise ValueError(
357 |                 f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
358 |         query = query.order_by(ob)
359 |     query = query.limit(page_size + 1).offset(offset)
360 |     _result = await Config.db.fetch_all(query=query)
361 |     has_more = len(_result) > page_size
362 |     result = [dict(i) for i in _result][:page_size]
363 |     query_string = str(query.compile(
364 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
365 |     Config.logger.info(
366 |         f'[Query] {len(result)} tasks (has_more={has_more}): {query_string}')
367 |     return result, has_more
368 | 
369 | 
370 | @alru_cache(maxsize=Config.query_task_ids_cache_maxsize)
371 | async def query_task_ids(task_name: Optional[str] = None,
372 |                          tag: str = '') -> List[int]:
373 |     query = tasks.select()
374 |     if task_name is not None:
375 |         query = query.where(tasks.c.name == task_name)
376 |     if tag:
377 |         query = query.where(tasks.c.tag == tag)
378 |     _result = await Config.db.fetch_all(query=query)
379 |     result = [dict(i)['task_id'] for i in _result]
380 |     query_string = str(query.compile(
381 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
382 |     Config.logger.info(f'[Query] {len(result)} task ids: {query_string}')
383 |     return result
384 | 
385 | 
386 | @alru_cache(maxsize=Config.query_group_task_ids_cache_maxsize)
387 | async def query_group_task_ids(
388 |     group_id: int = None,
389 |     group_ids: Union[str, Tuple[int]] = None,
390 | ) -> List[int]:
391 |     _group_ids: Set[int] = set()
392 |     if group_id:
393 |         _group_ids.add(int(group_id))
394 |     if group_ids:
395 |         if isinstance(group_ids, str):
396 |             for _group_id in re.findall(r'\d+', group_ids):
397 |                 _group_ids.add(int(_group_id))
398 |         elif isinstance(group_ids, tuple):
399 |             _group_ids.update(int(_g) for _g in group_ids)  # add every id in the tuple
400 |     task_ids: Set[int] = set()
401 |     for _group_id in _group_ids:
402 |         query = groups.select()
403 |         query = query.where(groups.c.id == _group_id)
404 |         _result = await Config.db.fetch_one(query=query)
405 |         if _result:
406 |             task_ids_str = dict(_result).get('task_ids') or ''
407 |             for task_id in re.findall(r'\d+', task_ids_str):
408 |                 task_ids.add(int(task_id))
409 |     query_string = str(query.compile(
410 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
411 |     Config.logger.info(
412 |         f'[Query] {len(task_ids)} task_ids by group {group_id or group_ids}: {query_string}'
413 |     )
414 |     return list(task_ids)
415 | 
416 | 
417 | @alru_cache(maxsize=Config.query_feeds_cache_maxsize)
418 | async def query_feeds(
419 |     task_name: Optional[str] = None,
420 |     task_id: Optional[int] = None,
421 |     page: int = 1,
422 |     page_size: int = Config.default_page_size,
423 |     order_by: str = 'id',
424 |     sort: str = 'desc',
425 |     tag: str = '',
426 |     task_ids: Tuple[int] = None,
427 | ) -> Tuple[List[dict], bool]:
428 |     # task_ids arg type is tuple for cache hashing
429 |     offset = page_size * (page - 1)
430 |     query = feeds.select()
431 |     _task_ids: List[int] = []
432 |     if task_ids:
433 |         _task_ids.extend(task_ids)
434 |     if tag:
435 |         _task_ids += await query_task_ids(tag=tag)
436 |     if _task_ids:
437 |         query = query.where(feeds.c.task_id.in_(tuple(_task_ids)))
438 |     else:
439 |         if task_id is not None:
440 |             query = query.where(feeds.c.task_id == task_id)
441 |         if task_name is not None:
442 |             query = query.where(feeds.c.name == task_name)
443 |     if order_by and sort:
444 |         ob = getattr(feeds.c, order_by, None)
445 |         if ob is None:
446 |             raise ValueError(f'bad order_by {order_by}')
447 |         if sort.lower() == 'desc':
448 |             ob = sqlalchemy.desc(ob)
449 |         elif sort.lower() == 'asc':
450 |             ob = sqlalchemy.asc(ob)
451 |         else:
452 |             raise ValueError(
453 |                 f"bad sort arg {sort} not in ('desc', 'asc', 'DESC', 'ASC')")
454 |         query = query.order_by(ob)
455 |     query = query.limit(page_size + 1).offset(offset)
456 |     _result = await Config.db.fetch_all(query=query)
457 |     has_more = len(_result) > page_size
458 |     result = [dict(i) for i in _result][:page_size]
459 |     query_string = str(query.compile(
460 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
461 |     Config.logger.info(
462 |         f'[Query] {len(result)} feeds (has_more={has_more}): {query_string}')
463 |     return result, has_more
464 | 
465 | 
466 | @alru_cache(maxsize=Config.query_groups_cache_maxsize)
467 | async def query_all_groups() -> List[dict]:
468 |     query = groups.select()
469 |     rows = await Config.db.fetch_all(query=query)
470 |     result = []
471 |     for row in rows:
472 |         result.append(dict(row))
473 |     query_string = str(query.compile(
474 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
475 |     Config.logger.info(f'[Query] {len(result)} groups: {query_string}')
476 |     return result
477 | 
478 | 
479 | async def query_task_errors(tag: str = '',
480 |                             task_ids: Iterable = None) -> List[dict]:
481 |     query = tasks.select().with_only_columns(tasks.c.name, tasks.c.error)
482 |     if task_ids:
483 |         query = query.where(tasks.c.task_id.in_(tuple(task_ids)))
484 |     query = query.where(tasks.c.error != '')
485 |     query = query.where(tasks.c.enable == 1)
486 |     if tag:
487 |         query = query.where(tasks.c.tag == tag)
488 |     query = query.order_by(sqlalchemy.desc('last_change_time'))
489 |     _result: list = await Config.db.fetch_all(query=query)
490 |     result = [dict(task) for task in _result]
491 |     query_string = str(query.compile(
492 |         compile_kwargs={"literal_binds": True})).replace('\n', '')
493 |     Config.logger.info(f'[Query] {len(result)} task errors: {query_string}')
494 |     return result
--------------------------------------------------------------------------------
/watchdogs/templates/index.html:
--------------------------------------------------------------------------------
[Template markup lost in extraction. Recoverable outline: a Vue + Element-UI single-page app titled "Watchdogs v{{version}}", with a task table and forms; a request_args hint ("retry": 2, "timeout": 3, "ssl": false, and other args refer to aiohttp); interval presets (5 mins, 10 mins, 30 mins, 1 hrs, 3 hrs, 6 hrs, 12 hrs, 1 day, 7 days, 30 days); a callbacks picker (${name||'default-callback'}, ${current_cb_doc}); a host-rule panel ("Frequency: Send [n] request each [interval] seconds", Update Frequency, Delete); and crawler-rule fields (Name, Regex, ${rule.name}, ${rule.regex}).]
--------------------------------------------------------------------------------
/watchdogs/app.py:
--------------------------------------------------------------------------------
1 | from base64 import b64encode
2 | from collections import deque
3 | from datetime import datetime
4 | from json import JSONDecodeError, dumps, loads
5 | from pathlib import Path
6 | from typing import Optional
7 | 
8 | import aiofiles
9 | from fastapi import Cookie, FastAPI, Header
10 | from fastapi.staticfiles import StaticFiles
11 | from starlette.requests import Request
12 | from starlette.responses import (
13 |     FileResponse,
14 |     HTMLResponse,
15 |     JSONResponse,
16 |     RedirectResponse,
17 |     Response,
18 | )
19 | from starlette.templating import Jinja2Templates
20 | from torequests.utils import timeago, ttime
21 | from uniparser import CrawlerRule, Uniparser
22 | from uniparser.fastapi_ui import app as sub_app
23 | from uniparser.utils import get_host
24 | 
25 | from . import __version__
26 | from .config import md5_checker
27 | from .crawler import crawl_once, find_next_check_time
28 | from .models import (
29 |     Group,
30 |     Task,
31 |     groups,
32 |     query_all_groups,
33 |     query_feeds,
34 |     query_group_task_ids,
35 |     query_task_errors,
36 |     query_tasks,
37 |     tasks,
38 | )
39 | from .settings import (
40 |     Config,
41 |     get_host_freq_list,
42 |     refresh_token,
43 |     release_app,
44 |     set_host_freq,
45 |     setup_app,
46 | )
47 | from .utils import format_size, gen_rss
48 | 
49 | description = "Watchdogs to keep an eye on the world's change.\nRead more: [https://github.com/ClericPy/watchdogs](https://github.com/ClericPy/watchdogs)"
50 | app = FastAPI(title="Watchdogs", description=description, version=__version__)
51 | 
52 | Config.setup_middleware(app)
53 | sub_app.openapi_prefix = '/uniparser'
54 | app.mount("/uniparser", sub_app)
55 | app.mount("/static",
56 |           StaticFiles(directory=str((Path(__file__).parent /
57 |                                      'static').absolute())),
58 |           name="static")
59 | logger = Config.logger
60 | templates = Jinja2Templates(directory=str((Path(__file__).parent /
61 |                                            'templates').absolute()))
62 | 
63 | 
64 | @app.on_event("startup")
65 | async def startup():
66 |     await setup_app(app)
67 | 
68 | 
69 | @app.on_event("shutdown")
70 | async def shutdown():
71 |     await release_app(app)
72 | 
73 | 
74 | @app.post('/auth')
75 | async def post_auth(request: Request,
76 |                     watchdog_auth: str = Cookie(''),
77 |                     redirect: str = '/'):
78 |     # two scenarios allow setting a new password (otherwise this is a login attempt):
79 |     # 1. watchdog_auth is not set yet; 2. the request is already authenticated
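    # Illustrative login request (hypothetical host/port; shows only the expected
    # JSON body, while the real password and cookie come from your deployment):
    #   curl -X POST 'http://127.0.0.1:9901/auth' -d '{"password": "my-password"}'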
80 |     password = loads(await request.body())['password']
81 |     auth_not_set = not Config.watchdog_auth
82 |     already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
83 |     need_new_pwd = auth_not_set or already_authed
84 |     if password:
85 |         if need_new_pwd:
86 |             old_password = Config.password
87 |             Config.password = password
88 |             await refresh_token()
89 |             resp = JSONResponse({'ok': True, 'redirect': redirect})
90 |             resp.set_cookie('watchdog_auth',
91 |                             Config.watchdog_auth,
92 |                             max_age=Config.cookie_max_age,
93 |                             httponly=True)
94 |             logger.warning(
95 |                 f'password changed {old_password}->{Config.password}.')
96 |             return resp
97 |         elif (await md5_checker(password, Config.watchdog_auth, freq=True)):
98 |             resp = JSONResponse({'ok': True, 'redirect': redirect})
99 |             resp.set_cookie('watchdog_auth',
100 |                             Config.watchdog_auth,
101 |                             max_age=Config.cookie_max_age,
102 |                             httponly=True)
103 |             logger.info('correct password, login success.')
104 |             return resp
105 |     # invalid password, clear cookie
106 |     resp = JSONResponse({'ok': False})
107 |     # resp.set_cookie('watchdog_auth', '')
108 |     resp.delete_cookie('watchdog_auth')
109 |     logger.info(f'invalid password: {password}')
110 |     return resp
111 | 
112 | 
113 | @app.get('/auth')
114 | async def auth(request: Request,
115 |                watchdog_auth: str = Cookie(''),
116 |                redirect: str = '/'):
117 |     auth_not_set = not Config.watchdog_auth
118 |     already_authed = watchdog_auth and watchdog_auth == Config.watchdog_auth
119 |     need_new_pwd = auth_not_set or already_authed
120 |     context: dict = {'request': request}
121 |     context['version'] = __version__
122 |     if need_new_pwd:
123 |         context['action'] = 'Init'
124 |         context['prompt_title'] = 'Set a new password'
125 |     else:
126 |         context['action'] = 'Login'
127 |         context['prompt_title'] = 'Input the password'
128 |     return templates.TemplateResponse("auth.html", context=context)
129 | 
130 | 
131 | @app.get("/")
132 | async def index(request: Request, tag: str = ''):
133 |     kwargs: dict = {'request': request}
134 |     kwargs['cdn_urls'] = Config.cdn_urls
135 |     kwargs['version'] = __version__
136 |     kwargs['rss_url'] = Config.get_route('/rss', tag=tag)
137 |     kwargs['lite_url'] = Config.get_route('/lite', tag=tag)
138 |     kwargs['feeds_url'] = Config.get_route('/feeds', tag=tag)
139 |     kwargs['rss_feeds_url'] = Config.get_route('/rss_feeds', tag=tag)
140 |     init_vars_json = dumps({
141 |         'custom_links': Config.custom_links,
142 |         'callback_workers': Config.callback_handler.workers,
143 |         'custom_tabs': Config.custom_tabs,
144 |         'work_hours_doc': find_next_check_time.__doc__,
145 |     })
146 |     init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
147 |     kwargs['init_vars'] = init_vars_b64
148 |     return templates.TemplateResponse("index.html", context=kwargs)
149 | 
150 | 
151 | @app.get("/favicon.ico")
152 | async def favicon():
153 |     return RedirectResponse('/static/img/favicon.svg', 301)
154 | 
155 | 
156 | @app.post("/add_new_task")
157 | async def add_new_task(task: Task):
158 |     try:
159 |         if task.interval < 60:
160 |             raise ValueError('interval should not be less than 60 seconds.')
161 |         db = Config.db
162 |         # check whether the task already exists
163 |         if task.task_id is None:
164 |             # insert new task
165 |             query = tasks.insert()
166 |             values = dict(task)
167 |             if not values.get('error'):
168 |                 values['error'] = ''
169 |             # insert with task_id left as None (autoincrement)
170 |             await db.execute(query=query, values=values)
171 |         else:
172 |             # update old task
173 |             query = 'update tasks set
156 | @app.post("/add_new_task")
157 | async def add_new_task(task: Task):
158 |     try:
159 |         if task.interval < 60:
160 |             raise ValueError('interval should not be less than 60 seconds.')
161 |         db = Config.db
162 |         # check whether the task already exists
163 |         if task.task_id is None:
164 |             # insert new task
165 |             query = tasks.insert()
166 |             values = dict(task)
167 |             if not values.get('error'):
168 |                 values['error'] = ''
169 |             # task_id is None, let the database assign it
170 |             await db.execute(query=query, values=values)
171 |         else:
172 |             # update old task
173 |             query = 'update tasks set `name`=:name,`enable`=:enable,`tag`=:tag,`request_args`=:request_args,`origin_url`=:origin_url,`interval`=:interval,`work_hours`=:work_hours,`max_result_count`=:max_result_count,`custom_info`=:custom_info,`next_check_time`=:next_check_time where `task_id`=:task_id'
174 |             values = {
175 |                 'task_id': task.task_id,
176 |                 'name': task.name,
177 |                 'enable': task.enable,
178 |                 'tag': task.tag,
179 |                 'request_args': task.request_args,
180 |                 'origin_url': task.origin_url,
181 |                 'interval': task.interval,
182 |                 'work_hours': task.work_hours,
183 |                 'max_result_count': task.max_result_count,
184 |                 'custom_info': task.custom_info,
185 |                 'next_check_time': datetime.now(),
186 |             }
187 |             await db.execute(query=query, values=values)
188 |         result = {'msg': 'ok'}
189 |         query_tasks.cache_clear()
190 |     except Exception as e:
191 |         result = {'msg': repr(e)}
192 |     logger.info(
193 |         f'{"[Add]" if task.task_id is None else "[Update]"} task {task}: {result}'
194 |     )
195 |     return result
196 | 
197 | 
198 | @app.get("/delete_task")
199 | async def delete_task(task_id: int):
200 |     try:
201 |         query = tasks.delete().where(tasks.c.task_id == task_id)
202 |         await Config.db.execute(query=query)
203 |         result = {'msg': 'ok'}
204 |         query_tasks.cache_clear()
205 |     except Exception as e:
206 |         result = {'msg': repr(e)}
207 |     logger.info(f'[Delete] task {task_id}: {result}')
208 |     return result
209 | 
210 | 
211 | @app.get("/force_crawl")
212 | async def force_crawl(task_name: str):
213 |     try:
214 |         task = await crawl_once(task_name=task_name)
215 |         task['timeago'] = timeago(
216 |             (datetime.now() - task['last_change_time']).total_seconds(),
217 |             1,
218 |             1,
219 |             short_name=True)
220 |         result = {'msg': 'ok', 'task': task}
221 |     except Exception as e:
222 |         result = {'msg': repr(e)}
223 |     logger.info(f'[Force] crawl {task_name}: {result}')
224 |     return result
225 | 
226 | 
227 | @app.get("/load_tasks")
228 | async def load_tasks(
229 |         task_name: Optional[str] = None,
230 |         page: int = 1,
231 |         page_size: int = Config.default_page_size,
232 |         order_by: str = 'last_change_time',
233 |         sort: str = 'desc',
234 |         tag: str = '',
235 | ):
236 |     try:
237 |         _result, has_more = await query_tasks(
238 |             task_name=task_name,
239 |             page=page,
240 |             page_size=page_size,
241 |             order_by=order_by,
242 |             sort=sort,
243 |             tag=tag,
244 |         )
245 |         _result = [task for task in _result]
246 |         now = datetime.now()
247 |         for item in _result:
248 |             item['timeago'] = timeago(
249 |                 (now - item['last_change_time']).total_seconds(),
250 |                 1,
251 |                 1,
252 |                 short_name=True)
253 |         result = {'msg': 'ok', 'tasks': _result, 'has_more': has_more}
254 |     except Exception as e:
255 |         result = {'msg': str(e), 'tasks': [], 'has_more': False}
256 |     return result
257 | 
258 | 
259 | @app.get("/enable_task")
260 | async def enable_task(task_id: int, enable: int = 1):
261 |     query = 'update tasks set `enable`=:enable where `task_id`=:task_id'
262 |     values = {'task_id': task_id, 'enable': enable}
263 |     try:
264 |         _result = await Config.db.execute(query, values)
265 |         result = {'msg': 'ok', 'updated': _result}
266 |         query_tasks.cache_clear()
267 |     except Exception as e:
268 |         result = {'msg': repr(e)}
269 |     return result
270 | 
271 | 
272 | @app.get('/load_hosts')
273 | async def load_hosts(host: str = ''):
274 |     host = get_host(host) or host
275 |     query = 'select `host` from host_rules'
276 |     if host:
277 |         query += ' where `host` like :host'
278 |         values = {'host': f'%{host}%'}
279 |     else:
280 |         values = {}
281 |     query += ' order by `host` asc'
282 |     _result = await Config.db.fetch_all(query, values)
283 |     host_freqs = Uniparser._HOST_FREQUENCIES
284 |     hosts = [{
285 |         'name': getattr(i, 'host', None),
286 |         'freq': getattr(i, 'host', None) in host_freqs
287 |     } for i in _result]
288 |     return {'hosts': hosts, 'host': host}
289 | 
290 | 
291 | @app.get("/get_host_rule")
292 | async def get_host_rule(host: str):
293 |     try:
294 |         if not host:
295 |             raise ValueError('host name should not be null')
296 |         query = 'select `host_rule` from host_rules where `host`=:host'
297 |         values = {'host': host}
298 |         _result = await Config.db.fetch_one(query, values)
299 |         host_rule = getattr(_result, 'host_rule', None)
300 |         host_rule = loads(host_rule) if host_rule else {"host": host}
301 |         host_rule['n'], host_rule['interval'] = get_host_freq_list(host)
302 |         result = {'msg': 'ok', 'host_rule': host_rule}
303 |     except Exception as e:
304 |         result = {'msg': repr(e)}
305 |     logger.info(f'[Get] host_rule {host}: {result}')
306 |     return result
307 | 
308 | 
309 | @app.post("/crawler_rule.{method}")
310 | async def crawler_rule(method: str,
311 |                        rule: CrawlerRule,
312 |                        force: Optional[int] = 0):
313 |     try:
314 |         if not rule['name']:
315 |             raise ValueError('rule name cannot be null')
316 |         if method == 'add':
317 |             if force:
318 |                 exist_rule = await Config.rule_db.find_crawler_rule(
319 |                     rule['request_args']['url'])
320 |                 if exist_rule:
321 |                     logger.info(
322 |                         f'add crawler_rule force=1, old rule removed: {exist_rule}'
323 |                     )
324 |                     await Config.rule_db.pop_crawler_rule(exist_rule)
325 |             _result = await Config.rule_db.add_crawler_rule(rule)
326 |         elif method == 'pop':
327 |             _result = await Config.rule_db.pop_crawler_rule(rule)
328 |         else:
329 |             raise ValueError('method only supports add and pop')
330 |         result = {'msg': 'ok', 'result': _result}
331 |     except Exception as e:
332 |         result = {'msg': repr(e)}
333 |     logger.info(f'[{method.title()}] crawler rule {rule}: {result}')
334 |     return result
335 | 
336 | 
337 | @app.post("/find_crawler_rule")
338 | async def find_crawler_rule(request_args: dict):
339 |     try:
340 |         url = request_args.get('url')
341 |         rule: CrawlerRule = await Config.rule_db.find_crawler_rule(url)
342 |         if not rule:
343 |             raise ValueError(f'rule not found for given url: {url}')
344 |         result = {'msg': 'ok', 'result': rule.dumps()}
345 |     except Exception as e:
346 |         result = {'msg': repr(e)}
347 |     logger.info(f'[Find] crawler rule: {result}')
348 |     return result
349 | 
350 | 
351 | @app.get("/delete_host_rule")
352 | async def delete_host_rule(host: str):
353 |     try:
354 |         if not host:
355 |             raise ValueError('host should not be null')
356 |         await Config.rule_db.pop_host_rule(host)
357 |         result = {'msg': 'ok'}
358 |     except Exception as e:
359 |         result = {'msg': repr(e)}
360 |     logger.info(f'[Delete] host rule {host}: {result}')
361 |     return result
362 | 
363 | 
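# A sketch (not part of app.py) of driving the rule endpoints above from a
# client. The payload layout follows uniparser's CrawlerRule ('name',
# 'request_args', 'parse_rules', 'regex'); the exact schema, the default port
# and the auth cookie value are assumptions here, not guarantees:
#
#     import requests
#
#     rule = {
#         'name': 'demo_rule',   # must not be empty, see the check above
#         'request_args': {'url': 'https://example.com/', 'method': 'get'},
#         'parse_rules': [],
#         'regex': 'https?://example\\.com/.*',
#     }
#     resp = requests.post('http://127.0.0.1:9901/crawler_rule.add',
#                          params={'force': 1},  # force=1 replaces an old rule
#                          json=rule,
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())        # {'msg': 'ok', 'result': ...} on success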
364 | @app.get("/log")
365 | async def log(request: Request,
366 |               max_lines: int = 50,
367 |               refresh_every: int = 0,
368 |               log_names: str = 'info-server-error'):
369 |     window: deque = deque((), max_lines)
370 |     names: list = log_names.split('-')
371 |     items = []
372 |     for name in names:
373 |         file_name = f'{name}.log'
374 |         fp: Path = Config.CONFIG_DIR / file_name
375 |         if not fp.is_file():
376 |             continue
377 |         fp_stat = fp.stat()
378 |         file_size = format_size(fp_stat.st_size)
379 |         st_mtime = ttime(fp_stat.st_mtime)
380 |         line_no = 0
381 |         async with aiofiles.open(fp, encoding=Config.ENCODING) as f:
382 |             async for line in f:
383 |                 line_no += 1
384 |                 window.append(line)
385 |         item = {
386 |             'name': name,
387 |             'line_no': line_no,
388 |             'file_size': file_size,
389 |             'st_mtime': st_mtime,
390 |             'log_text': "".join(window),
391 |             'file_size_mb': Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
392 |                 'file_size_mb', '-1'),
393 |         }
394 |         items.append(item)
395 |         window.clear()
396 |     context = {
397 |         'request': request,
398 |         'items': items,
399 |         'log_names': log_names,
400 |         'refresh_every': refresh_every,
401 |         'max_lines': max_lines,
402 |     }
403 |     return templates.TemplateResponse("logs.html", context=context)
404 | 
405 | 
406 | @app.get("/log.clear")
407 | async def log_clear(log_names: str = 'info-server-error',
408 |                     current_names: str = 'info-server-error'):
409 |     names: list = log_names.split('-')
410 |     for name in names:
411 |         fp: Path = Config.CONFIG_DIR / f'{name}.log'
412 |         if not fp.is_file():
413 |             continue
414 |         # synchronous write: truncating a log file is quick, so briefly blocking the event loop is acceptable
415 |         fp.write_bytes(b'')
416 |         logger.info(f'{name}.log cleared')
417 |     html = f'{log_names} log cleared. Redirecting back.'
418 |     return HTMLResponse(html)
419 | 
420 | 
421 | @app.get("/update_host_freq")
422 | async def update_host_freq(host: str,
423 |                            n: Optional[int] = 0,
424 |                            interval: Optional[int] = 0):
425 |     try:
426 |         if not host:
427 |             raise ValueError('host should not be null')
428 |         await set_host_freq(host, n=n, interval=interval)
429 |         result = {'msg': 'ok'}
430 |     except Exception as e:
431 |         result = {'msg': repr(e)}
432 |     logger.info(f'[Update] host frequency {host}: {result}')
433 |     return result
434 | 
435 | 
436 | @app.get("/rss")
437 | async def rss(
438 |         request: Request,
439 |         tag: str = '',
440 |         sign: str = '',
441 |         host: str = Header('', alias='Host'),
442 |         group_ids: str = '',
443 | ):
444 |     if group_ids:
445 |         task_ids = tuple(await query_group_task_ids(group_ids))
446 |         if not task_ids:
447 |             return JSONResponse(
448 |                 status_code=404,
449 |                 content={
450 |                     "message": 'no tasks found',
451 |                 },
452 |             )
453 |         tasks, _ = await query_tasks(task_ids=task_ids)
454 |     else:
455 |         tasks, _ = await query_tasks(tag=tag)
456 |     source_link = f'https://{host}'
457 |     xml_data: dict = {
458 |         'channel': {
459 |             'title': 'Watchdogs',
460 |             'description': f'Watchdog on web change, v{__version__}.',
461 |             'link': source_link,
462 |         },
463 |         'items': []
464 |     }
465 |     for task in tasks:
466 |         pubDate: str = task['last_change_time'].strftime(
467 |             format='%a, %d %b %Y %H:%M:%S')
468 |         latest_result: dict = loads(task['latest_result'] or '{}')
469 |         if isinstance(latest_result, list):
470 |             logger.error(f'latest_result is list: {latest_result}'); latest_result = {}  # fall back to a dict so .get below cannot raise
471 |         link: str = latest_result.get('url') or task['origin_url']
472 |         description: str = latest_result.get('text') or ''
473 |         title: str = f'{task["name"]}#{latest_result.get("title", description[:Config.TEXT_SLICE_LENGTH])}'
474 |         item: dict = {
475 |             'title': title,
476 |             'link': link,
477 |             'guid': title,
478 |             'description': description,
479 |             'pubDate': pubDate
480 |         }
481 |         xml_data['items'].append(item)
482 |     xml: str = gen_rss(xml_data)
483 |     response = Response(
484 |         content=xml,
485 |         media_type="application/xml",
486 |         headers={'Content-Type': 'application/xml; charset="utf-8"'})
487 |     return response
488 | 
489 | 
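# The `/rss` route above hands gen_rss a plain dict; a minimal sketch of that
# shape (all values invented for illustration), useful when debugging
# utils.gen_rss in isolation:
#
#     xml_data = {
#         'channel': {
#             'title': 'Watchdogs',
#             'description': 'Watchdog on web change, v2.0.1.',
#             'link': 'https://127.0.0.1:9901',
#         },
#         'items': [{
#             'title': 'task_name#latest title',
#             'link': 'https://example.com/latest',
#             'guid': 'task_name#latest title',
#             'description': 'latest text',
#             'pubDate': 'Sat, 01 Jan 2022 00:00:00',
#         }],
#     }
#     # gen_rss(xml_data) returns the serialized feed XML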
490 | @app.post("/lite")
491 | async def post_lite(request: Request, tag: str = '', sign: str = ''):
492 |     task_id = loads(await request.body())['task_id']
493 |     tasks, _ = await query_tasks(task_id=task_id)
494 |     if tasks:
495 |         task = tasks[0]
496 |         try:
497 |             result_list = loads(
498 |                 task['result_list']) if task['result_list'] else []
499 |         except JSONDecodeError:
500 |             result_list = []
501 |         return {'result_list': result_list}
502 |     else:
503 |         return {'result_list': []}
504 | 
505 | 
506 | @app.get("/lite")
507 | async def lite(
508 |         request: Request,
509 |         tag: str = '',
510 |         sign: str = '',
511 |         page: int = 1,
512 |         group_ids: str = '',
513 | ):
514 |     if group_ids:
515 |         task_ids = tuple(await query_group_task_ids(group_ids))
516 |         if not task_ids:
517 |             return JSONResponse(
518 |                 status_code=404,
519 |                 content={
520 |                     "message": 'no tasks found',
521 |                 },
522 |             )
523 |         tasks, has_more = await query_tasks(task_ids=task_ids, page=page)
524 |     else:
525 |         tasks, has_more = await query_tasks(tag=tag, page=page)
526 |     now = datetime.now()
527 |     for task in tasks:
528 |         result = loads(task['latest_result'] or '{}')
529 |         # cache url/text on the task, falling back to the latest result
530 |         task['url'] = task.get('url') or result.get('url') or task['origin_url']
531 |         task['text'] = task.get('text') or result.get('title') or result.get(
532 |             'text') or ''
533 |         task['timeago'] = timeago(
534 |             (now - task['last_change_time']).total_seconds(),
535 |             1,
536 |             1,
537 |             short_name=True)
538 |     context = {'tasks': tasks, 'request': request}
539 |     context['version'] = __version__
540 |     if group_ids:
541 |         params = {'group_ids': group_ids}
542 |     else:
543 |         params = {'tag': tag}
544 |     context['home_url'] = Config.get_route('/lite', **params)
545 |     if has_more:
546 |         if group_ids:
547 |             next_page_url = Config.get_route('/lite', page=page + 1, **params)
548 |         else:
549 |             next_page_url = Config.get_route('/lite', page=page + 1, **params)
550 |     else:
551 |         next_page_url = ''
552 |     context['next_page_url'] = next_page_url
553 |     if page > 1:
554 |         if group_ids:
555 |             last_page_url = Config.get_route('/lite', page=page - 1, **params)
556 |         else:
557 |             last_page_url = Config.get_route('/lite', page=page - 1, **params)
558 |     else:
559 |         last_page_url = ''
560 |     context['last_page_url'] = last_page_url
561 |     context['rss_url'] = Config.get_route('/rss', **params)
562 |     return templates.TemplateResponse("lite.html", context=context)
563 | 
564 | 
565 | @app.get("/feeds")
566 | async def feeds(
567 |         request: Request,
568 |         tag: str = '',
569 |         # user: str = '',
570 |         sign: str = '',
571 |         page: int = 1,
572 |         # page_size: int = Config.default_page_size,
573 |         group_ids: str = '',
574 | ):
575 |     error_tasks = []
576 |     if group_ids:
577 |         task_ids = tuple(await query_group_task_ids(group_ids))
578 |         if not task_ids:
579 |             return JSONResponse(
580 |                 status_code=404,
581 |                 content={
582 |                     "message": 'no tasks found',
583 |                 },
584 |             )
585 |         feeds, has_more = await query_feeds(task_ids=task_ids,
586 |                                             tag=tag,
587 |                                             page=page)
588 |         if page == 1:
589 |             error_tasks.extend(await query_task_errors(tag=tag,
590 |                                                        task_ids=task_ids))
591 |     else:
592 |         feeds, has_more = await query_feeds(tag=tag, page=page)
593 |         if page == 1:
594 |             error_tasks.extend(await query_task_errors(tag=tag))
595 |     now = datetime.now()
596 |     _feeds = []
597 |     current_date = None
598 |     today = datetime.today().strftime('%Y-%m-%d')
599 |     for feed in feeds:
600 |         date = feed['ts_create'].strftime('%Y-%m-%d')
601 |         if date != current_date:
602 |             current_date = date
603 |             if date == today:
604 |                 date += ' [Today]'
605 |             _feeds.append({'current_date': date})
606 |         feed['timeago'] = timeago((now - feed['ts_create']).total_seconds(),
607 |                                   1,
608 |                                   1,
609 |                                   short_name=True)
610 |         _feeds.append(feed)
611 |     context = {'feeds': _feeds, 'request': request, 'error_tasks': error_tasks}
612 |     context['version'] = __version__
613 |     if group_ids:
614 |         params = {'group_ids': group_ids}
615 |     else:
616 |         params = {'tag': tag}
617 |     context['home_url'] = Config.get_route('/feeds', **params)
618 |     if has_more:
619 |         if group_ids:
620 |             next_page_url = Config.get_route('/feeds', page=page + 1, **params)
621 |         else:
622 |             next_page_url = Config.get_route('/feeds', page=page + 1, **params)
623 |     else:
624 |         next_page_url = ''
625 |     context['next_page_url'] = next_page_url
626 |     if page > 1:
627 |         if group_ids:
628 |             last_page_url = Config.get_route('/feeds', page=page - 1, **params)
629 |         else:
630 |             last_page_url = Config.get_route('/feeds', page=page - 1, **params)
631 |     else:
632 |         last_page_url = ''
633 |     context['last_page_url'] = last_page_url
634 |     context['rss_url'] = Config.get_route('/rss_feeds', **params)
635 |     return templates.TemplateResponse("feeds.html", context=context)
636 | 
637 | 
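# A sketch (not part of app.py) of calling the POST /lite endpoint above,
# which returns the stored result_list of a single task. The host/port and
# the auth cookie value are assumptions:
#
#     import requests
#
#     resp = requests.post('http://127.0.0.1:9901/lite',
#                          json={'task_id': 1},
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())   # {'result_list': [...]} or {'result_list': []}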
638 | @app.get("/rss_feeds")
639 | async def rss_feeds(request: Request,
640 |                     tag: str = '',
641 |                     sign: str = '',
642 |                     host: str = Header('', alias='Host'),
643 |                     group_ids: str = ''):
644 |     if group_ids:
645 |         task_ids = tuple(await query_group_task_ids(group_ids))
646 |         if not task_ids:
647 |             return JSONResponse(
648 |                 status_code=404,
649 |                 content={
650 |                     "message": 'no tasks found',
651 |                 },
652 |             )
653 |         feeds, _ = await query_feeds(task_ids=task_ids)
654 |     else:
655 |         feeds, _ = await query_feeds(tag=tag)
656 |     source_link = f'https://{host}'
657 |     xml_data: dict = {
658 |         'channel': {
659 |             'title': 'Watchdogs Timeline',
660 |             'description': f'Watchdog on web change, v{__version__}.',
661 |             'link': source_link,
662 |         },
663 |         'items': []
664 |     }
665 |     for feed in feeds:
666 |         pubDate: str = feed['ts_create'].strftime(
667 |             format='%a, %d %b %Y %H:%M:%S')
668 |         link: str = feed['url']
669 |         description: str = feed['text']
670 |         title: str = f'{feed["name"]}#{description[:Config.TEXT_SLICE_LENGTH]}'
671 |         item: dict = {
672 |             'title': title,
673 |             'link': link,
674 |             'guid': str(feed['id']),
675 |             'description': description,
676 |             'pubDate': pubDate
677 |         }
678 |         xml_data['items'].append(item)
679 |     xml: str = gen_rss(xml_data)
680 |     response = Response(
681 |         content=xml,
682 |         media_type="application/xml",
683 |         headers={'Content-Type': 'application/xml; charset="utf-8"'})
684 |     return response
685 | 
686 | 
687 | @app.get("/groups")
688 | async def groups_route(request: Request):
689 |     groups = await query_all_groups()
690 |     for _group in groups:
691 |         _group['href_feeds'] = Config.get_route('/feeds',
692 |                                                 group_ids=_group['id'])
693 |         _group['href_lite'] = Config.get_route('/lite', group_ids=_group['id'])
694 |     context = {
695 |         'request': request,
696 |         'groups': groups,
697 |     }
698 |     return templates.TemplateResponse("groups.html", context=context)
699 | 
700 | 
701 | @app.post("/update_group")
702 | async def update_group(group: Group, action: str):
703 |     try:
704 |         db = Config.db
705 |         # dispatch on the requested action
706 |         if action == 'new':
707 |             # insert a new group
708 |             query = groups.insert()
709 |             values = dict(group)
710 |             # id is None, let the database assign it
711 |             resp = await db.execute(query=query, values=values)
712 |         elif action == 'delete':
713 |             query = 'delete from groups where `id`=:id'
714 |             values = {'id': group.id}
715 |             resp = await db.execute(query=query, values=values)
716 |         else:
717 |             # update an existing group
718 |             query = 'update groups set `name`=:name,`task_ids`=:task_ids where `id`=:id'
719 |             values = {
720 |                 'id': group.id,
721 |                 'name': group.name,
722 |                 'task_ids': group.task_ids,
723 |             }
724 |             resp = await db.execute(query=query, values=values)
725 |         result = {'msg': 'ok', 'resp': str(resp)}
726 |     except Exception as e:
727 |         result = {'msg': repr(e)}
728 |     finally:
729 |         query_all_groups.cache_clear()
730 |         query_group_task_ids.cache_clear()
731 |     logger.info(f'[{action.title()}] {group}: {result}')
732 |     return result
733 | 
734 | 
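# A sketch (not part of app.py) of creating a group via /update_group. Field
# names mirror the Group model imported above ('id', 'name', 'task_ids'); the
# exact field types live in models.py, so treat this payload, the port and
# the cookie value as assumptions:
#
#     import requests
#
#     group = {'id': None, 'name': 'demo group', 'task_ids': '1,2,3'}
#     resp = requests.post('http://127.0.0.1:9901/update_group',
#                          params={'action': 'new'},  # or 'delete' / update
#                          json=group,
#                          cookies={'watchdog_auth': '...'})
#     print(resp.json())   # {'msg': 'ok', 'resp': ...} on success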
735 | @app.get("/sqlite")
736 | async def download_db():
737 |     if Config.db_url.startswith('sqlite:///'):
738 |         return FileResponse(path=Config.db_url.replace('sqlite:///', ''))
739 |     return Response(content=b'not sqlite', status_code=404)
740 | 
--------------------------------------------------------------------------------
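Since app.py exposes a standard ASGI application, it can also be served with uvicorn directly. A minimal sketch, assuming the dependencies from requirements.txt are installed; the host and port are arbitrary choices here, and the package's own entry point performs extra Config setup that this bypasses:

    import uvicorn

    if __name__ == '__main__':
        # serve watchdogs.app:app; Config must be initialized for startup() to succeed
        uvicorn.run('watchdogs.app:app', host='127.0.0.1', port=9901)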