├── .gitignore ├── setup.cfg ├── test ├── data │ ├── invalid-url.yaml │ ├── urlwatch.yaml │ ├── urls.txt │ └── filter_tests.yaml ├── test_filters.py └── test_handler.py ├── MANIFEST.in ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── requirements-dev.txt ├── lib └── urlwatch │ ├── __init__.py │ ├── ical2txt.py │ ├── migration.py │ ├── main.py │ ├── util.py │ ├── cli.py │ ├── mailer.py │ ├── worker.py │ ├── html2txt.py │ ├── config.py │ ├── handler.py │ ├── jobs.py │ ├── command.py │ ├── storage.py │ ├── filters.py │ └── reporters.py ├── COPYING ├── share ├── urlwatch │ └── examples │ │ ├── urls.yaml.example │ │ └── hooks.py.example └── man │ └── man1 │ └── urlwatch.1 ├── setup.py ├── CHANGELOG.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | build 4 | /venv -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /test/data/invalid-url.yaml: -------------------------------------------------------------------------------- 1 | name: "invalid url" 2 | url: "https://invalid" 3 | max_tries: 2 4 | --- 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md COPYING README.md 2 | recursive-include share * 3 | recursive-include test/data * 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | install: 8 | - python setup.py install_dependencies 9 | script: nosetests -v 10 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # The COVID Tracking Project Code of Conduct 2 | 3 | The COVID Tracking Project follows the [code of conduct defined in this document](https://github.com/COVID19Tracking/code-of-conduct/blob/master/CODE_OF_CONDUCT.md). -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | certifi==2019.11.28 3 | chardet==3.0.4 4 | cssselect==1.1.0 5 | idna==2.9 6 | keyring==21.2.0 7 | lxml==4.5.0 8 | minidb==2.0.2 9 | PyYAML==5.3.1 10 | requests==2.23.0 11 | urllib3==1.25.8 12 | pytesseract==0.3.3 13 | Pillow==7.0.0 14 | -------------------------------------------------------------------------------- /lib/urlwatch/__init__.py: -------------------------------------------------------------------------------- 1 | """urlwatch monitors webpages for you 2 | 3 | urlwatch is intended to help you watch changes in webpages and get notified 4 | (via e-mail, in your terminal or through various third party services) of any 5 | changes. The change notification will include the URL that has changed and 6 | a unified diff of what has changed. 
7 | """ 8 | 9 | pkgname = 'urlwatch' 10 | 11 | __copyright__ = 'Copyright 2008-2019 Thomas Perl' 12 | __author__ = 'Thomas Perl ' 13 | __license__ = 'BSD' 14 | __url__ = 'https://thp.io/2008/urlwatch/' 15 | __version__ = '2.17' 16 | __user_agent__ = '%s/%s (+https://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__) 17 | -------------------------------------------------------------------------------- /test/data/urlwatch.yaml: -------------------------------------------------------------------------------- 1 | display: 2 | error: true 3 | new: true 4 | unchanged: false 5 | report: 6 | email: 7 | enabled: false 8 | from: '' 9 | html: false 10 | method: smtp 11 | sendmail: 12 | path: sendmail 13 | smtp: 14 | host: localhost 15 | auth: true 16 | port: 25 17 | starttls: true 18 | subject: '{count} changes: {jobs}' 19 | to: '' 20 | html: 21 | diff: unified 22 | pushover: 23 | app: '' 24 | device: null 25 | enabled: false 26 | sound: 'spacealarm' 27 | user: '' 28 | stdout: 29 | color: true 30 | enabled: true 31 | text: 32 | details: true 33 | footer: true 34 | line_length: 75 35 | job_defaults: 36 | all: {} 37 | shell: {} 38 | url: {} 39 | browser: {} 40 | -------------------------------------------------------------------------------- /test/data/urls.txt: -------------------------------------------------------------------------------- 1 | 2 | # This is an example urls.txt file for urlwatch 3 | # Empty lines and lines starting with "#" are ignored 4 | 5 | http://www.dubclub-vienna.com/ 6 | http://www.openpandora.org/developers.php 7 | #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/info.html 8 | #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/blatter.html 9 | #http://www.dbai.tuwien.ac.at/education/dbs/current/index.html 10 | #http://www.dbai.tuwien.ac.at/education/dbs/current/uebung.html 11 | http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming 12 | http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming/labor 13 | 
http://ti.tuwien.ac.at/rts/teaching/courses/betriebssysteme 14 | #http://www.complang.tuwien.ac.at/anton/lvas/effiziente-programme.html 15 | #http://www.complang.tuwien.ac.at/anton/lvas/effizienz-aufgabe08/ 16 | http://www.kukuk.at/ical/events 17 | http://guckes.net/cal/ 18 | 19 | # You can use the pipe character to "watch" the output of shell commands 20 | |ls -al ~ 21 | 22 | # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) 23 | http://example.org/With%20Spaces/ 24 | 25 | # You can do POST requests by writing the POST data behind the URL, 26 | # separated by a single space character. POST data is URL-encoded. 27 | http://example.com/search.cgi button=Search&q=something&category=4 28 | 29 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2019 Thomas Perl 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 3. The name of the author may not be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
19 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /test/test_filters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import yaml 4 | from urlwatch.filters import FilterBase 5 | from nose.tools import eq_ 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def test_filters(): 11 | def check_filter(test_name): 12 | filter = filter_tests[test_name]['filter'] 13 | data = filter_tests[test_name]['data'] 14 | expected_result = filter_tests[test_name]['expected_result'] 15 | if isinstance(filter, dict): 16 | key = next(iter(filter)) 17 | kind, subfilter = key, filter[key] 18 | elif isinstance(filter, str): 19 | if ',' in filter: 20 | raise ValueError('Only single filter allowed in this test') 21 | elif ':' in filter: 22 | kind, subfilter = filter.split(':', 1) 23 | else: 24 | kind = filter 25 | subfilter = None 26 | logger.info('filter kind: %s, subfilter: %s', kind, subfilter) 27 | filtercls = FilterBase.__subclasses__.get(kind) 28 | if filtercls is None: 29 | raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter)) 30 | result = filtercls(None, None).filter(data, subfilter) 31 | logger.debug('Expected result:\n%s', expected_result) 32 | logger.debug('Actual result:\n%s', result) 33 | eq_(result, expected_result) 34 | 35 | with open(os.path.join(os.path.dirname(__file__), 'data/filter_tests.yaml'), 'r', encoding='utf8') 
as fp: 36 | filter_tests = yaml.load(fp, Loader=yaml.SafeLoader) 37 | for test_name in filter_tests: 38 | yield check_filter, test_name 39 | -------------------------------------------------------------------------------- /share/urlwatch/examples/urls.yaml.example: -------------------------------------------------------------------------------- 1 | # This is an example urls.yaml file for urlwatch 2 | 3 | # A basic URL job just needs a URL 4 | name: "urlwatch webpage" 5 | url: "https://thp.io/2008/urlwatch/" 6 | # You can use a pre-supplied filter for this, here we apply two: 7 | # the html2text filter that converts the HTML to plaintext and 8 | # the grep filter that filters lines based on a regular expression 9 | filter: html2text,grep:Current.*version,strip 10 | --- 11 | # Built-in job kind "shell" needs a command specified 12 | name: "Home Listing" 13 | command: "ls -al ~" 14 | #--- 15 | #name: "Login to some webpage (custom job)" 16 | #url: "http://example.org/" 17 | # This job kind is defined in hooks.py, so you need to enable it 18 | #kind: custom-login 19 | # Additional parameters for the custom-login job kind can be specified here 20 | #username: "myuser" 21 | #password: "secret" 22 | # Filters can be specified here, separated by comma (these are also from hooks.py) 23 | #filter: case:upper,indent:5 24 | --- 25 | # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) 26 | url: "http://example.org/With%20Spaces/" 27 | --- 28 | # POST requests are done by providing a post parameter 29 | url: "http://example.com/search.cgi" 30 | data: "button=Search&q=something&category=4" 31 | --- 32 | # You can use a custom HTTP method, this might be useful for cache invalidation 33 | url: "http://example.com/foo" 34 | method: "PURGE" 35 | --- 36 | # You can do POST requests by providing data parameter. 37 | # POST data can be a URL-encoded string (see last example) or a dict. 
38 | url: "http://example.com/search.cgi" 39 | data: 40 | button: Search 41 | q: something 42 | category: 4 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup 4 | from distutils import cmd 5 | 6 | import os 7 | import re 8 | import sys 9 | 10 | main_py = open(os.path.join('lib', 'urlwatch', '__init__.py')).read() 11 | m = dict(re.findall("\n__([a-z]+)__ = '([^']+)'", main_py)) 12 | docs = re.findall('"""(.*?)"""', main_py, re.DOTALL) 13 | 14 | if sys.version_info < (3, 3): 15 | sys.exit('urlwatch requires Python 3.3 or newer') 16 | 17 | m['name'] = 'urlwatch' 18 | m['author'], m['author_email'] = re.match(r'(.*) <(.*)>', m['author']).groups() 19 | m['description'], m['long_description'] = docs[0].strip().split('\n\n', 1) 20 | m['install_requires'] = ['minidb', 'PyYAML', 'requests', 'keyring', 'pycodestyle', 'appdirs', 'lxml', 'cssselect', 'pytesseract', 'Pillow'] 21 | if sys.version_info < (3, 4): 22 | m['install_requires'].extend(['enum34']) 23 | if sys.platform == 'win32': 24 | m['install_requires'].extend(['colorama']) 25 | m['entry_points'] = {"console_scripts": ["urlwatch=urlwatch.cli:main"]} 26 | m['package_dir'] = {'': 'lib'} 27 | m['packages'] = ['urlwatch'] 28 | m['python_requires'] = '>3.3.0' 29 | m['data_files'] = [ 30 | ('share/man/man1', ['share/man/man1/urlwatch.1']), 31 | ('share/urlwatch/examples', [ 32 | 'share/urlwatch/examples/hooks.py.example', 33 | 'share/urlwatch/examples/urls.yaml.example', 34 | ]), 35 | ] 36 | 37 | 38 | class InstallDependencies(cmd.Command): 39 | """Install dependencies only""" 40 | 41 | description = 'Only install required packages using pip' 42 | user_options = [] 43 | 44 | def initialize_options(self): 45 | ... 46 | 47 | def finalize_options(self): 48 | ... 
49 | 50 | def run(self): 51 | global m 52 | try: 53 | from pip._internal import main 54 | except ImportError: 55 | from pip import main 56 | try: 57 | main(['install', '--upgrade'] + m['install_requires']) 58 | except TypeError: # recent pip 59 | main.main(['install', '--upgrade'] + m['install_requires']) 60 | 61 | 62 | m['cmdclass'] = {'install_dependencies': InstallDependencies} 63 | 64 | del m['copyright'] 65 | setup(**m) 66 | -------------------------------------------------------------------------------- /share/man/man1/urlwatch.1: -------------------------------------------------------------------------------- 1 | .TH URLWATCH "1" "January 2019" "urlwatch 2.16" "User Commands" 2 | .SH NAME 3 | urlwatch \- monitors webpages for you 4 | .SH SYNOPSIS 5 | .B urlwatch [options] 6 | .SH DESCRIPTION 7 | urlwatch is intended to help you watch changes in webpages and get notified 8 | (via e\-mail, in your terminal or through various third party services) of any 9 | changes. The change notification will include the URL that has changed and 10 | a unified diff of what has changed. 
11 | .SS "optional arguments:" 12 | .TP 13 | \fB\-h\fR, \fB\-\-help\fR 14 | show this help message and exit 15 | .TP 16 | \fB\-\-version\fR 17 | show program's version number and exit 18 | .TP 19 | \fB\-v\fR, \fB\-\-verbose\fR 20 | show debug output 21 | .SS "files and directories:" 22 | .TP 23 | \fB\-\-urls\fR FILE 24 | read job list (URLs) from FILE 25 | .TP 26 | \fB\-\-config\fR FILE 27 | read configuration from FILE 28 | .TP 29 | \fB\-\-hooks\fR FILE 30 | use FILE as hooks.py module 31 | .TP 32 | \fB\-\-cache\fR FILE 33 | use FILE as cache database 34 | .SS "Authentication:" 35 | .TP 36 | \fB\-\-smtp\-login\fR 37 | Enter password for SMTP (store in keyring) 38 | .TP 39 | \fB\-\-telegram\-chats\fR 40 | List telegram chats the bot is joined to 41 | .TP 42 | \fB\-\-test\-slack\fR 43 | Send a test notification to Slack 44 | .SS "job list management:" 45 | .TP 46 | \fB\-\-list\fR 47 | list jobs 48 | .TP 49 | \fB\-\-add\fR JOB 50 | add job (key1=value1,key2=value2,...) 51 | .TP 52 | \fB\-\-delete\fR JOB 53 | delete job by location or index 54 | .TP 55 | \fB\-\-test\-filter\fR JOB 56 | test filter output of job by location or index 57 | .SS "interactive commands ($EDITOR/$VISUAL):" 58 | .TP 59 | \fB\-\-edit\fR 60 | edit URL/job list 61 | .TP 62 | \fB\-\-edit\-config\fR 63 | edit configuration file 64 | .TP 65 | \fB\-\-edit\-hooks\fR 66 | edit hooks script 67 | .SS "miscellaneous:" 68 | .TP 69 | \fB\-\-features\fR 70 | list supported jobs/filters/reporters 71 | .TP 72 | \fB\-\-gc\-cache\fR 73 | remove old cache entries 74 | .SH "FILES" 75 | .TP 76 | .B $XDG_CONFIG_HOME/urlwatch/urls.yaml 77 | A list of URLs, commands and other jobs to watch 78 | .TP 79 | .B $XDG_CONFIG_HOME/urlwatch/hooks.py 80 | A Python module that can implement new job types, filters and reporters 81 | .TP 82 | .B $XDG_CACHE_HOME/urlwatch/cache.db 83 | A SQLite 3 database that contains the state history of jobs (for diffing) 84 | .SH AUTHOR 85 | Thomas Perl 86 | .SH WEBSITE 87 | 
https://thp.io/2008/urlwatch/ 88 | -------------------------------------------------------------------------------- /lib/urlwatch/ical2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | def ical2text(ical_string): 32 | import vobject 33 | result = [] 34 | if isinstance(ical_string, str): 35 | parsedCal = vobject.readOne(ical_string) 36 | else: 37 | try: 38 | parsedCal = vobject.readOne(ical_string) 39 | except Exception as e: 40 | parsedCal = vobject.readOne(ical_string.decode('utf-8', 'ignore')) 41 | 42 | for event in parsedCal.getChildren(): 43 | if event.name == 'VEVENT': 44 | if hasattr(event, 'dtstart'): 45 | start = event.dtstart.value.strftime('%F %H:%M') 46 | else: 47 | start = 'unknown start date' 48 | 49 | if hasattr(event, 'dtend'): 50 | end = event.dtend.value.strftime('%F %H:%M') 51 | else: 52 | end = start 53 | 54 | if start == end: 55 | date_str = start 56 | else: 57 | date_str = '%s -- %s' % (start, end) 58 | 59 | result.append('%s: %s' % (date_str, event.summary.value)) 60 | 61 | return '\n'.join(result) 62 | -------------------------------------------------------------------------------- /lib/urlwatch/migration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 
18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import logging 32 | import os.path 33 | import sys 34 | 35 | from .util import atomic_rename 36 | from .storage import UrlsYaml, UrlsTxt, CacheDirStorage 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | def migrate_urls(urlwatcher): 42 | # Migrate urlwatch 1.x URLs to urlwatch 2.x 43 | 44 | urlwatch_config = urlwatcher.urlwatch_config 45 | pkgname = urlwatch_config.pkgname 46 | urls = urlwatch_config.urls 47 | urls_txt = os.path.join(urlwatch_config.urlwatch_dir, 'urls.txt') 48 | edit = urlwatch_config.edit 49 | add = urlwatch_config.add 50 | features = urlwatch_config.features 51 | edit_hooks = urlwatch_config.edit_hooks 52 | edit_config = urlwatch_config.edit_config 53 | gc_cache = urlwatch_config.gc_cache 54 | 55 | if os.path.isfile(urls_txt) and not os.path.isfile(urls): 56 | print(""" 57 | Migrating URLs: {urls_txt} -> {urls_yaml} 58 | Use "{pkgname} --edit" to customize it. 
59 | """.format(urls_txt=urls_txt, urls_yaml=urls, pkgname=pkgname)) 60 | UrlsYaml(urls).save(UrlsTxt(urls_txt).load_secure()) 61 | atomic_rename(urls_txt, urls_txt + '.migrated') 62 | 63 | if not any([os.path.isfile(urls), edit, add, features, edit_hooks, edit_config, gc_cache]): 64 | print(""" 65 | You need to create {urls_yaml} in order to use {pkgname}. 66 | Use "{pkgname} --edit" to open the file with your editor. 67 | """.format(urls_yaml=urls, pkgname=pkgname)) 68 | sys.exit(1) 69 | 70 | 71 | def migrate_cache(urlwatcher): 72 | # Migrate urlwatch 1.x cache to urlwatch 2.x 73 | 74 | urlwatch_config = urlwatcher.urlwatch_config 75 | cache = urlwatch_config.cache 76 | cache_dir = os.path.join(urlwatch_config.urlwatch_dir, 'cache') 77 | 78 | # On Windows and macOS with case-insensitive filesystems, we have to check if 79 | # "cache.db" exists in the folder, and in this case, avoid migration (Issue #223) 80 | if os.path.isdir(cache_dir) and not os.path.isfile(os.path.join(cache_dir, 'cache.db')): 81 | print(""" 82 | Migrating cache: {cache_dir} -> {cache_db} 83 | """.format(cache_dir=cache_dir, cache_db=cache)) 84 | 85 | old_cache_storage = CacheDirStorage(cache_dir) 86 | urlwatcher.cache_storage.restore(old_cache_storage.backup()) 87 | urlwatcher.cache_storage.gc([job.get_guid() for job in urlwatcher.jobs]) 88 | atomic_rename(cache_dir, cache_dir + '.migrated') 89 | -------------------------------------------------------------------------------- /lib/urlwatch/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. 
Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import imp 32 | import logging 33 | import os 34 | 35 | from .handler import Report 36 | from .worker import run_jobs 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class Urlwatch(object): 42 | def __init__(self, urlwatch_config, config_storage, cache_storage, urls_storage): 43 | 44 | self.urlwatch_config = urlwatch_config 45 | 46 | logger.info('Using %s as URLs file', self.urlwatch_config.urls) 47 | logger.info('Using %s for hooks', self.urlwatch_config.hooks) 48 | logger.info('Using %s as cache database', self.urlwatch_config.cache) 49 | 50 | self.config_storage = config_storage 51 | self.cache_storage = cache_storage 52 | self.urls_storage = urls_storage 53 | 54 | self.report = Report(self) 55 | self.jobs = None 56 | 57 | self.check_directories() 58 | 59 | if hasattr(self.urlwatch_config, 'migrate_urls'): 60 | self.urlwatch_config.migrate_urls(self) 61 | 62 | if not self.urlwatch_config.edit_hooks: 63 | self.load_hooks() 64 | 65 | if not self.urlwatch_config.edit: 66 | self.load_jobs() 67 | 68 | if hasattr(self.urlwatch_config, 'migrate_urls'): 69 | self.urlwatch_config.migrate_cache(self) 70 | 71 | def check_directories(self): 72 | if not os.path.isdir(self.urlwatch_config.urlwatch_dir): 73 | os.makedirs(self.urlwatch_config.urlwatch_dir) 74 | if not os.path.exists(self.urlwatch_config.config): 75 | self.config_storage.write_default_config(self.urlwatch_config.config) 76 | print(""" 77 | A default config has been written to {config_yaml}. 78 | Use "{pkgname} --edit-config" to customize it. 
79 | """.format(config_yaml=self.urlwatch_config.config, pkgname=self.urlwatch_config.pkgname)) 80 | 81 | def load_hooks(self): 82 | if os.path.exists(self.urlwatch_config.hooks): 83 | imp.load_source('hooks', self.urlwatch_config.hooks) 84 | 85 | def load_jobs(self): 86 | if os.path.isfile(self.urlwatch_config.urls): 87 | jobs = self.urls_storage.load_secure() 88 | logger.info('Found {0} jobs'.format(len(jobs))) 89 | else: 90 | logger.warn('No jobs file found') 91 | jobs = [] 92 | 93 | self.jobs = jobs 94 | 95 | def run_jobs(self): 96 | run_jobs(self) 97 | 98 | def close(self): 99 | self.report.finish() 100 | self.cache_storage.close() 101 | -------------------------------------------------------------------------------- /lib/urlwatch/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import logging 32 | import os 33 | import platform 34 | import subprocess 35 | import shlex 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | class TrackSubClasses(type): 41 | """A metaclass that stores subclass name-to-class mappings in the base class""" 42 | 43 | @staticmethod 44 | def sorted_by_kind(cls): 45 | return [item for _, item in sorted((it.__kind__, it) for it in cls.__subclasses__.values())] 46 | 47 | def __init__(cls, name, bases, namespace): 48 | for base in bases: 49 | if base == object: 50 | continue 51 | 52 | for attr in ('__required__', '__optional__'): 53 | if not hasattr(base, attr): 54 | continue 55 | 56 | inherited = getattr(base, attr, ()) 57 | new_value = tuple(namespace.get(attr, ())) + tuple(inherited) 58 | namespace[attr] = new_value 59 | setattr(cls, attr, new_value) 60 | 61 | for base in bases: 62 | if base == object: 63 | continue 64 | 65 | if hasattr(cls, '__kind__'): 66 | subclasses = getattr(base, '__subclasses__', None) 67 | if subclasses is not None: 68 | logger.info('Registering %r as %s', cls, cls.__kind__) 69 | subclasses[cls.__kind__] = cls 70 | break 71 | else: 72 | anonymous_subclasses = getattr(base, '__anonymous_subclasses__', None) 73 | if anonymous_subclasses is not None: 74 | logger.info('Registering %r', cls) 75 | anonymous_subclasses.append(cls) 76 | break 77 | 78 | super().__init__(name, bases, namespace) 79 | 80 | 81 | def atomic_rename(old_filename, 
new_filename): 82 | if platform.system() == 'Windows' and os.path.exists(new_filename): 83 | new_old_filename = new_filename + '.bak' 84 | if os.path.exists(new_old_filename): 85 | os.remove(new_old_filename) 86 | os.rename(new_filename, new_old_filename) 87 | os.rename(old_filename, new_filename) 88 | if os.path.exists(new_old_filename): 89 | os.remove(new_old_filename) 90 | else: 91 | os.rename(old_filename, new_filename) 92 | 93 | 94 | def edit_file(filename): 95 | editor = os.environ.get('EDITOR', None) 96 | if not editor: 97 | editor = os.environ.get('VISUAL', None) 98 | if not editor: 99 | raise SystemExit('Please set $VISUAL or $EDITOR.') 100 | 101 | subprocess.check_call(shlex.split(editor) + [filename]) 102 | -------------------------------------------------------------------------------- /share/urlwatch/examples/hooks.py.example: -------------------------------------------------------------------------------- 1 | # 2 | # Example hooks file for urlwatch 3 | # 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 1. Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 2. Redistributions in binary form must reproduce the above copyright 13 | # notice, this list of conditions and the following disclaimer in the 14 | # documentation and/or other materials provided with the distribution. 15 | # 3. The name of the author may not be used to endorse or promote products 16 | # derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
21 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | # 29 | 30 | import re 31 | 32 | from urlwatch import filters 33 | from urlwatch import jobs 34 | from urlwatch import reporters 35 | 36 | 37 | #class CustomLoginJob(jobs.UrlJob): 38 | # """Custom login for my webpage""" 39 | # 40 | # __kind__ = 'custom-login' 41 | # __required__ = ('username', 'password') 42 | # 43 | # def retrieve(self, job_state): 44 | # return 'Would log in to {} with {} and {}\n'.format(self.url, self.username, self.password) 45 | 46 | 47 | #class CaseFilter(filters.FilterBase): 48 | # """Custom filter for changing case, needs to be selected manually""" 49 | # 50 | # __kind__ = 'case' 51 | # 52 | # def filter(self, data, subfilter=None): 53 | # # The subfilter is specified using a colon, for example the "case" 54 | # # filter here can be specified as "case:upper" and "case:lower" 55 | # 56 | # if subfilter is None: 57 | # subfilter = 'upper' 58 | # 59 | # if subfilter == 'upper': 60 | # return data.upper() 61 | # elif subfilter == 'lower': 62 | # return data.lower() 63 | # else: 64 | # raise ValueError('Unknown case subfilter: %r' % (subfilter,)) 65 | 66 | 67 | #class IndentFilter(filters.FilterBase): 68 | # """Custom filter for indenting, needs to be selected manually""" 69 | # 70 | # __kind__ = 'indent' 71 | # 72 | # def filter(self, data, subfilter=None): 73 | # # The subfilter here is a number of characters to indent 74 | # 75 | # if subfilter is None: 76 | # indent = 8 77 | # else: 78 | # indent = 
class CustomMatchUrlFilter(filters.AutoMatchFilter):
    """Automatic filter applied to every job whose properties match MATCH."""

    # AutoMatchFilter subclasses are applied automatically to all jobs
    # whose attributes equal the values given here.
    MATCH = {'url': 'http://example.org/'}

    def filter(self, data):
        rewritten = data.replace('foo', 'bar')
        return rewritten


class CustomRegexMatchUrlFilter(filters.RegexMatchFilter):
    """Like CustomMatchUrlFilter, but job properties are matched by regex."""

    MATCH = {'url': re.compile('http://example.org/.*')}

    def filter(self, data):
        rewritten = data.replace('foo', 'bar')
        return rewritten


class CustomTextFileReporter(reporters.TextReporter):
    """Custom reporter that writes the text-only report to a file"""

    __kind__ = 'custom_file'

    def submit(self):
        report_text = '\n'.join(super().submit())
        with open(self.config['filename'], 'w') as fp:
            fp.write(report_text)


class CustomHtmlFileReporter(reporters.HtmlReporter):
    """Custom reporter that writes the HTML report to a file"""

    __kind__ = 'custom_html'

    def submit(self):
        report_html = '\n'.join(super().submit())
        with open(self.config['filename'], 'w') as fp:
            fp.write(report_html)
# File and folder paths
import logging
import os.path
import signal
import socket
import sys

from appdirs import AppDirs

pkgname = 'urlwatch'
# Legacy location (~/.urlwatch); preferred if it already exists.
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urlwatch_cache_dir = AppDirs(pkgname).user_cache_dir

if not os.path.exists(urlwatch_dir):
    # No legacy directory: fall back to the platform config dir (XDG etc.).
    urlwatch_dir = AppDirs(pkgname).user_config_dir

# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Running from a source checkout: make the bundled lib/ importable.
    # NOTE: this sys.path tweak must happen BEFORE the urlwatch imports below.
    sys.path.insert(0, os.path.join(prefix, bindir, 'lib'))

from urlwatch.command import UrlwatchCommand
from urlwatch.config import CommandConfig
from urlwatch.main import Urlwatch
from urlwatch.storage import YamlConfigStorage, CacheMiniDBStorage, UrlsYaml

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Ignore SIGPIPE for stdout (see https://github.com/thp/urlwatch/issues/77)
try:
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
except AttributeError:
    # Windows does not have signal.SIGPIPE
    ...

logger = logging.getLogger(pkgname)

# Default file names inside the configuration/cache directories.
CONFIG_FILE = 'urlwatch.yaml'
URLS_FILE = 'urls.yaml'
CACHE_FILE = 'cache.db'
HOOKS_FILE = 'hooks.py'


def setup_logger(verbose):
    """Attach a DEBUG-level console handler to the root logger when *verbose*.

    Repeated calls add one handler per call; this is only invoked once
    from main().
    """
    if verbose:
        root_logger = logging.getLogger('')
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter('%(asctime)s %(module)s %(levelname)s: %(message)s'))
        root_logger.addHandler(console)
        root_logger.setLevel(logging.DEBUG)
        root_logger.info('turning on verbose logging mode')


def main():
    """Command-line entry point: resolve file paths, build the watcher, run it."""
    config_file = os.path.join(urlwatch_dir, CONFIG_FILE)
    urls_file = os.path.join(urlwatch_dir, URLS_FILE)
    hooks_file = os.path.join(urlwatch_dir, HOOKS_FILE)
    new_cache_file = os.path.join(urlwatch_cache_dir, CACHE_FILE)
    old_cache_file = os.path.join(urlwatch_dir, CACHE_FILE)
    # Prefer the new cache location, but keep using a pre-existing old one.
    cache_file = new_cache_file
    if os.path.exists(old_cache_file) and not os.path.exists(new_cache_file):
        cache_file = old_cache_file

    # CommandConfig also parses command-line arguments (see config.py).
    command_config = CommandConfig(pkgname, urlwatch_dir, bindir, prefix,
                                   config_file, urls_file, hooks_file, cache_file, False)
    setup_logger(command_config.verbose)

    # setup storage API
    config_storage = YamlConfigStorage(command_config.config)
    cache_storage = CacheMiniDBStorage(command_config.cache)
    urls_storage = UrlsYaml(command_config.urls)

    # setup urlwatcher
    urlwatch = Urlwatch(command_config, config_storage, cache_storage, urls_storage)
    urlwatch_command = UrlwatchCommand(urlwatch)

    # run urlwatcher
    urlwatch_command.run()


if __name__ == '__main__':
    main()
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import smtplib 32 | import getpass 33 | import subprocess 34 | import logging 35 | 36 | try: 37 | import keyring 38 | except ImportError: 39 | keyring = None 40 | 41 | import email.mime.multipart 42 | import email.mime.text 43 | import email.utils 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | class Mailer(object): 49 | def send(self, msg): 50 | raise NotImplementedError 51 | 52 | def msg_plain(self, from_email, to_email, subject, body): 53 | msg = email.mime.text.MIMEText(body, 'plain', 'utf-8') 54 | msg['Subject'] = subject 55 | msg['From'] = from_email 56 | msg['To'] = to_email 57 | msg['Date'] = email.utils.formatdate() 58 | 59 | return msg 60 | 61 | def msg_html(self, from_email, to_email, subject, body_text, body_html): 62 | msg = email.mime.multipart.MIMEMultipart('alternative') 63 | msg['Subject'] = subject 64 | msg['From'] = from_email 65 | msg['To'] = to_email 66 | msg['Date'] = email.utils.formatdate() 67 | 68 | msg.attach(email.mime.text.MIMEText(body_text, 'plain', 'utf-8')) 69 | msg.attach(email.mime.text.MIMEText(body_html, 'html', 'utf-8')) 70 | 71 | return msg 72 | 73 | 74 | class SMTPMailer(Mailer): 75 | def __init__(self, smtp_user, smtp_server, smtp_port, tls, auth, insecure_password=None): 76 | self.smtp_server = smtp_server 77 | self.smtp_user = smtp_user 78 | self.smtp_port = smtp_port 79 | self.tls = tls 80 | self.auth = auth 81 | self.insecure_password = insecure_password 82 | 83 | 
def send(self, msg): 84 | s = smtplib.SMTP(self.smtp_server, self.smtp_port) 85 | s.ehlo() 86 | 87 | if self.tls: 88 | s.starttls() 89 | 90 | if self.auth: 91 | if self.insecure_password: 92 | passwd = self.insecure_password 93 | elif keyring is not None: 94 | passwd = keyring.get_password(self.smtp_server, self.smtp_user) 95 | if passwd is None: 96 | raise ValueError('No password available in keyring for {}, {}'.format(self.smtp_server, self.smtp_user)) 97 | s.login(self.smtp_user, passwd) 98 | 99 | s.sendmail(msg['From'], msg['To'].split(','), msg.as_string()) 100 | s.quit() 101 | 102 | 103 | class SendmailMailer(Mailer): 104 | def __init__(self, sendmail_path): 105 | self.sendmail_path = sendmail_path 106 | 107 | def send(self, msg): 108 | p = subprocess.Popen([self.sendmail_path, '-oi', msg['To']], 109 | stdin=subprocess.PIPE, 110 | stderr=subprocess.PIPE, 111 | universal_newlines=True) 112 | result = p.communicate(msg.as_string()) 113 | if p.returncode: 114 | logger.error('Sendmail failed with {result}'.format(result=result)) 115 | 116 | 117 | def have_password(smtp_server, from_email): 118 | return keyring.get_password(smtp_server, from_email) is not None 119 | 120 | 121 | def set_password(smtp_server, from_email): 122 | ''' Set the keyring password for the mail connection. Interactive.''' 123 | if keyring is None: 124 | raise ImportError('keyring module missing - service unsupported') 125 | 126 | password = getpass.getpass(prompt='Enter password for {} using {}: '.format(from_email, smtp_server)) 127 | keyring.set_password(smtp_server, from_email, password) 128 | -------------------------------------------------------------------------------- /lib/urlwatch/worker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import concurrent.futures
import logging
import difflib

logger = logging.getLogger(__name__)

# Upper bound on jobs processed concurrently.
MAX_WORKERS = 10


def run_parallel(func, items):
    """Yield func(item) for every item, computed on a thread pool.

    Results are yielded in completion order, not input order.  The first
    exception raised by any worker is re-raised in the caller.  The pool
    is now used as a context manager so its threads are shut down
    deterministically (the old code leaked the executor).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for future in concurrent.futures.as_completed(executor.submit(func, item) for item in items):
            exception = future.exception()
            if exception is not None:
                raise exception
            yield future.result()


def run_jobs(urlwatcher):
    """Run all configured jobs once and feed their outcomes into the report.

    Every job is processed in parallel; each result is classified as
    unchanged / changed / new / error and the per-job state is written
    back to the cache storage.
    """
    # Deferred imports from sibling modules of this package (run-time only).
    from .handler import JobState
    from .jobs import NotModifiedError

    cache_storage = urlwatcher.cache_storage
    jobs = [job.with_defaults(urlwatcher.config_storage.config)
            for job in urlwatcher.jobs]
    report = urlwatcher.report

    # Let every job register any shared resources it needs up front.
    resources = {}
    for job in jobs:
        job.request_resources(resources)

    logger.debug('Processing %d jobs', len(jobs))
    for job_state in run_parallel(lambda job_state: job_state.process(),
                                  (JobState(cache_storage, resources, job) for job in jobs)):
        logger.debug('Job finished: %s', job_state.job)

        # Treat a missing/None max_tries as "no retries" (0).
        max_tries = job_state.job.max_tries or 0
        logger.debug('Using max_tries of %i for %s', max_tries, job_state.job)

        if job_state.exception is not None:
            if job_state.error_ignored:
                logger.info('Error while executing job %s ignored due to job config', job_state.job)
            elif isinstance(job_state.exception, NotModifiedError):
                # HTTP 304: nothing changed; reset the retry counter.
                logger.info('Job %s has not changed (HTTP 304)', job_state.job)
                report.unchanged(job_state)
                if job_state.tries > 0:
                    job_state.tries = 0
                    job_state.save()
            elif job_state.tries < max_tries:
                # Transient failure with retries left: record the attempt only.
                logger.debug('This was try %i of %i for job %s', job_state.tries,
                             max_tries, job_state.job)
                job_state.save()
            else:
                # Retries exhausted (tries >= max_tries): surface the error.
                logger.debug('We are now at %i tries ', job_state.tries)
                job_state.save()
                report.error(job_state)

        elif job_state.old_data is not None:
            # Previous data exists: compare against stored history.
            matched_history_time = job_state.history_data.get(job_state.new_data)
            if matched_history_time:
                job_state.timestamp = matched_history_time
            if matched_history_time or job_state.new_data == job_state.old_data:
                report.unchanged(job_state)
                if job_state.tries > 0:
                    job_state.tries = 0
                    job_state.save()
            else:
                # Diff against the most similar historical version, if any.
                close_matches = difflib.get_close_matches(job_state.new_data, job_state.history_data, n=1)
                if close_matches:
                    job_state.old_data = close_matches[0]
                    job_state.timestamp = job_state.history_data[close_matches[0]]
                report.changed(job_state)
                job_state.tries = 0
                job_state.save()
        else:
            # First successful run for this job.
            report.new(job_state)
            job_state.tries = 0
            job_state.save()

    for job in jobs:
        job.release_resources(resources)
import re
import os
import subprocess
import logging

logger = logging.getLogger(__name__)


def html2text(data, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' - Use "lynx -dump" for conversion
              options: see "lynx -help" output for options that work with "-dump"
     'html2text' - Use "html2text -nobs" for conversion
                   options: https://linux.die.net/man/1/html2text
     'bs4' - Use Beautiful Soup library to prettify the HTML
             options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
             http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're' - A simple regex-based HTML tag stripper
     'pyhtml2text' - Use Python module "html2text"
                     options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    Raises ValueError for an unknown method.
    """
    if method == 're':
        # Strip tags, then drop blank lines and trailing whitespace.
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip() for line in stripped_tags.splitlines() if line.strip() != '')

    if method == 'pyhtml2text':
        # Local import: optional dependency. Aliased so it does not shadow
        # this function's own name.
        import html2text as html2text_module
        parser = html2text_module.HTML2Text()
        for key, value in options.items():
            setattr(parser, key.lower(), value)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        # Use .get() instead of .pop(): .pop() mutated the caller's options
        # dict, so every call after the first silently fell back to the
        # default parser.
        parser = options.get('parser', 'html.parser')
        soup = BeautifulSoup(data, parser)
        return soup.get_text(strip=True)

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset UTF-8', '-display_charset UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    stdout_encoding = 'utf-8'

    for k, v in options.items():
        # A boolean True means "pass the bare flag"; any other value is
        # appended after the flag.  The original ternary had these two
        # cases swapped, emitting "-flag True" and dropping real values.
        # (Flag and value are kept as one argv token, as before — TODO
        # confirm lynx/html2text accept this form.)
        cmd.append('-%s' % k if v is True else '-%s %s' % (k, v))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Force a UTF-8 locale so the converters emit predictable output.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout)

    return stdout.strip()
import argparse
import logging
import os

logger = logging.getLogger(__name__)


class BaseConfig(object):
    """Plain container for the paths and settings shared by urlwatch commands."""

    def __init__(self, pkgname, urlwatch_dir, config, urls, cache, hooks, verbose):
        self.pkgname = pkgname            # package name ('urlwatch')
        self.urlwatch_dir = urlwatch_dir  # configuration directory
        self.config = config              # path to urlwatch.yaml
        self.urls = urls                  # path to the job list file
        self.cache = cache                # path to the cache database
        self.hooks = hooks                # path to the hooks.py module
        self.verbose = verbose            # enable debug logging


class CommandConfig(BaseConfig):
    """BaseConfig extended with install-layout detection and CLI parsing.

    Constructing an instance parses sys.argv (see parse_args), so every
    command-line option becomes an attribute of this object.
    """

    def __init__(self, pkgname, urlwatch_dir, bindir, prefix, config, urls, hooks, cache, verbose):
        super().__init__(pkgname, urlwatch_dir, config, urls, cache, hooks, verbose)
        # Deferred import from a sibling module so this class definition
        # does not require the full package at import time.
        from .migration import migrate_cache, migrate_urls

        self.bindir = bindir
        self.prefix = prefix
        self.migrate_cache = migrate_cache
        self.migrate_urls = migrate_urls

        if self.bindir == 'bin':
            # Installed system-wide
            self.examples_dir = os.path.join(prefix, 'share', self.pkgname, 'examples')
        else:
            # Assume we are not yet installed
            self.examples_dir = os.path.join(prefix, bindir, 'share', self.pkgname, 'examples')

        self.urls_yaml_example = os.path.join(self.examples_dir, 'urls.yaml.example')
        self.hooks_py_example = os.path.join(self.examples_dir, 'hooks.py.example')

        self.parse_args()

    def parse_args(self):
        """Parse sys.argv and copy every option onto self as an attribute."""
        import urlwatch  # deferred: only needed for --version/--help text

        parser = argparse.ArgumentParser(description=urlwatch.__doc__,
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument('--version', action='version', version='%(prog)s {}'.format(urlwatch.__version__))
        parser.add_argument('-v', '--verbose', action='store_true', help='show debug output')
        group = parser.add_argument_group('files and directories')
        group.add_argument('--urls', metavar='FILE', help='read job list (URLs) from FILE',
                           default=self.urls)
        group.add_argument('--config', metavar='FILE', help='read configuration from FILE',
                           default=self.config)
        group.add_argument('--hooks', metavar='FILE', help='use FILE as hooks.py module',
                           default=self.hooks)
        group.add_argument('--cache', metavar='FILE', help='use FILE as cache database',
                           default=self.cache)

        group = parser.add_argument_group('Authentication')
        group.add_argument('--smtp-login', action='store_true', help='Enter password for SMTP (store in keyring)')
        group.add_argument('--telegram-chats', action='store_true', help='List telegram chats the bot is joined to')
        group.add_argument('--test-slack', action='store_true', help='Send a test notification to Slack')

        group = parser.add_argument_group('job list management')
        group.add_argument('--list', action='store_true', help='list jobs')
        group.add_argument('--add', metavar='JOB', help='add job (key1=value1,key2=value2,...)')
        group.add_argument('--delete', metavar='JOB', help='delete job by location or index')
        group.add_argument('--test-filter', metavar='JOB', help='test filter output of job by location or index')
        group = parser.add_argument_group('interactive commands ($EDITOR/$VISUAL)')
        group.add_argument('--edit', action='store_true', help='edit URL/job list')
        group.add_argument('--edit-config', action='store_true', help='edit configuration file')
        group.add_argument('--edit-hooks', action='store_true', help='edit hooks script')
        group = parser.add_argument_group('miscellaneous')
        group.add_argument('--features', action='store_true', help='list supported jobs/filters/reporters')
        group.add_argument('--gc-cache', action='store_true', help='remove old cache entries')
        group.add_argument('--report-timestamps', action='store_true', help='report job timestamps')

        args = parser.parse_args()

        # Copy every parsed option onto this object.  (The old loop used
        # enumerate() with an unused index plus a getattr() per name.)
        for arg, argval in vars(args).items():
            setattr(self, arg, argval)
foo
23 |
bar
24 | 25 | expected_result: |- 26 |
foo
bar
27 | element_by_id: 28 | filter: element-by-id:bar 29 | data: | 30 | 31 |
asdf bar
32 |
asdf bar hoho
33 | 34 | expected_result: |- 35 |
asdf bar hoho
36 | element_by_class: 37 | filter: element-by-class:foo 38 | data: | 39 | 40 |
foo
41 |
bar
42 | 43 | expected_result: |- 44 |
foo
45 | xpath_elements: 46 | filter: xpath://div | //*[@id="bar"] 47 | data: | 48 | 49 |
foo
50 |
bar
51 | 52 | expected_result: | 53 |
foo
54 | 55 |
bar
56 | xpath_text: 57 | filter: xpath://div[1]/text() | //div[2]/@id 58 | data: | 59 | 60 |
foo
61 |
bar
62 | 63 | expected_result: |- 64 | foo 65 | bar 66 | xpath_exclude: 67 | filter: 68 | xpath: 69 | path: //div 70 | exclude: //*[@class='excl'] | //*/@class 71 | data: | 72 | 73 |
you don't want to see me
74 |
finterrupt!ointerrupt!o
75 |
bar
76 | 77 | expected_result: | 78 |
foo
79 | 80 |
bar
81 | xpath_xml_namespaces: 82 | filter: 83 | xpath: 84 | path: //item | //f:item 85 | method: xml 86 | exclude: //f:year | //author 87 | namespaces: 88 | f: foo 89 | data: | 90 | 91 | 92 | 2017 93 | Tom 94 | abc 95 | 96 | 97 | 2018 98 | Jerry 99 | xyz 100 | 101 | 102 | expected_result: | 103 | 104 | 105 | 106 | abc 107 | 108 | 109 | 110 | 2018 111 | Jerry 112 | xyz 113 | 114 | css: 115 | filter: css:div 116 | data: | 117 | 118 |
foo
119 |
bar
120 | 121 | expected_result: | 122 |
foo
123 | 124 |
bar
125 | css_exclude: 126 | filter: 127 | css: 128 | selector: div 129 | exclude: '.excl, #bar' 130 | data: | 131 | 132 |
you don't want to see me
133 |
finterrupt!ointerrupt!o
134 |
bar
135 | 136 | expected_result: | 137 |
foo
138 | css_xml_namespaces: 139 | filter: 140 | css: 141 | selector: item, f|item 142 | method: xml 143 | exclude: f|year, author 144 | namespaces: 145 | f: foo 146 | data: | 147 | 148 | 149 | 2017 150 | Tom 151 | abc 152 | 153 | 154 | 2018 155 | Jerry 156 | xyz 157 | 158 | 159 | expected_result: | 160 | 161 | 162 | 163 | abc 164 | 165 | 166 | 167 | 2018 168 | Jerry 169 | xyz 170 | 171 | grep: 172 | filter: grep:blue 173 | data: | 174 | The rose is red; 175 | the violet's blue. 176 | Sugar is sweet, 177 | and so are you. 178 | expected_result: |- 179 | the violet's blue. 180 | grep_with_comma: 181 | filter: grep:\054 182 | data: | 183 | The rose is red; 184 | the violet's blue. 185 | Sugar is sweet, 186 | and so are you. 187 | expected_result: |- 188 | Sugar is sweet, 189 | json_format: 190 | filter: format-json 191 | data: | 192 | {"field1": {"f1.1": "value"},"field2": "value"} 193 | expected_result: |- 194 | { 195 | "field1": { 196 | "f1.1": "value" 197 | }, 198 | "field2": "value" 199 | } 200 | json_format_subfilter: 201 | filter: format-json:2 202 | data: | 203 | {"field1": {"f1.1": "value"},"field2": "value"} 204 | expected_result: |- 205 | { 206 | "field1": { 207 | "f1.1": "value" 208 | }, 209 | "field2": "value" 210 | } 211 | sha1: 212 | filter: sha1sum 213 | data: 1234567890abcdefg 214 | expected_result: 8417680c09644df743d7cea1366fbe13a31b2d5e 215 | hexdump: 216 | filter: hexdump 217 | data: | 218 | Hello world! 219 | 你好,世界! 220 | expected_result: |- 221 | 48 65 6c 6c 6f 20 77 6f 72 6c 64 21 0a e4 bd a0 Hello world!.... 222 | e5 a5 bd ef bc 8c e4 b8 96 e7 95 8c ef bc 81 0a ................ 223 | -------------------------------------------------------------------------------- /lib/urlwatch/handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import datetime 32 | import logging 33 | import time 34 | import traceback 35 | 36 | from .filters import FilterBase 37 | from .jobs import NotModifiedError 38 | from .reporters import ReporterBase 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class JobState(object): 44 | def __init__(self, cache_storage, resources, job): 45 | self.cache_storage = cache_storage 46 | self.resources = resources 47 | self.job = job 48 | self.verb = None 49 | self.old_data = None 50 | self.new_data = None 51 | self.history_data = {} 52 | self.timestamp = None 53 | self.exception = None 54 | self.traceback = None 55 | self.tries = 0 56 | self.etag = None 57 | self.error_ignored = False 58 | 59 | def load(self): 60 | guid = self.job.get_guid() 61 | self.old_data, self.timestamp, self.tries, self.etag = self.cache_storage.load(self.job, guid) 62 | if self.tries is None: 63 | self.tries = 0 64 | if self.job.compared_versions and self.job.compared_versions > 1: 65 | self.history_data = self.cache_storage.get_history_data(guid, self.job.compared_versions) 66 | 67 | def save(self): 68 | if self.new_data is None and self.exception is not None: 69 | # If no new data has been retrieved due to an exception, use the old job data 70 | self.new_data = self.old_data 71 | 72 | self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time(), self.tries, self.etag) 73 | 74 | def process(self): 75 | now_epoch = datetime.datetime.now().strftime('%s') 76 | logger.info('Processing: %s', self.job) 77 | try: 78 | try: 79 | self.load() 80 | data = self.job.retrieve(self) 81 | 82 | # Apply automatic filters first 83 | data = FilterBase.auto_process(self, data) 84 | 85 | # Apply any specified filters 86 | filter_list = self.job.filter 87 | 88 | if filter_list: 89 | if isinstance(filter_list, list): 90 | for item in filter_list: 91 | key = next(iter(item)) 92 | filter_kind, subfilter = key, item[key] 93 | data = FilterBase.process(filter_kind, subfilter, self, 
data) 94 | elif isinstance(filter_list, str): 95 | for filter_kind in filter_list.split(','): 96 | if ':' in filter_kind: 97 | filter_kind, subfilter = filter_kind.split(':', 1) 98 | else: 99 | subfilter = None 100 | data = FilterBase.process(filter_kind, subfilter, self, data) 101 | self.new_data = data 102 | 103 | except Exception as e: 104 | # job has a chance to format and ignore its error 105 | self.exception = e 106 | self.traceback = self.job.format_error(e, traceback.format_exc()) 107 | self.new_data = "ERROR|%s: %s" % (now_epoch, e) 108 | self.error_ignored = self.job.ignore_error(e) 109 | if not (self.error_ignored or isinstance(e, NotModifiedError)): 110 | self.tries += 1 111 | logger.debug('Increasing number of tries to %i for %s', self.tries, self.job) 112 | except Exception as e: 113 | # job failed its chance to handle error 114 | self.exception = e 115 | self.traceback = traceback.format_exc() 116 | self.new_data = "ERROR|%s: %s" % (now_epoch, e) 117 | self.error_ignored = False 118 | if not isinstance(e, NotModifiedError): 119 | self.tries += 1 120 | logger.debug('Increasing number of tries to %i for %s', self.tries, self.job) 121 | 122 | return self 123 | 124 | 125 | class Report(object): 126 | def __init__(self, urlwatch_config): 127 | self.config = urlwatch_config.config_storage.config 128 | 129 | self.job_states = [] 130 | self.start = datetime.datetime.now() 131 | 132 | def _result(self, verb, job_state): 133 | if job_state.exception is not None: 134 | # TODO: Once we require Python >= 3.5, we can just pass in job_state.exception as "exc_info" parameter 135 | exc_info = (type(job_state.exception), job_state.exception, job_state.exception.__traceback__) 136 | logger.debug('Got exception while processing %r', job_state.job, exc_info=exc_info) 137 | 138 | job_state.verb = verb 139 | self.job_states.append(job_state) 140 | 141 | def new(self, job_state): 142 | self._result('new', job_state) 143 | 144 | def changed(self, job_state): 145 | 
self._result('changed', job_state) 146 | 147 | def unchanged(self, job_state): 148 | self._result('unchanged', job_state) 149 | 150 | def error(self, job_state): 151 | self._result('error', job_state) 152 | 153 | def get_filtered_job_states(self, job_states): 154 | for job_state in job_states: 155 | if not any(job_state.verb == verb and not self.config['display'][verb] 156 | for verb in ('unchanged', 'new', 'error')): 157 | yield job_state 158 | 159 | def finish(self): 160 | end = datetime.datetime.now() 161 | duration = (end - self.start) 162 | 163 | ReporterBase.submit_all(self, self.job_states, duration) 164 | -------------------------------------------------------------------------------- /test/test_handler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from glob import glob 3 | 4 | import pycodestyle as pycodestyle 5 | from urlwatch.jobs import UrlJob, JobBase, ShellJob 6 | from urlwatch.storage import UrlsYaml, UrlsTxt 7 | 8 | from nose.tools import raises, with_setup 9 | 10 | import tempfile 11 | import os 12 | import imp 13 | 14 | from urlwatch import storage 15 | from urlwatch.config import BaseConfig 16 | from urlwatch.storage import YamlConfigStorage, CacheMiniDBStorage 17 | from urlwatch.main import Urlwatch 18 | 19 | 20 | def test_required_classattrs_in_subclasses(): 21 | for kind, subclass in JobBase.__subclasses__.items(): 22 | assert hasattr(subclass, '__kind__') 23 | assert hasattr(subclass, '__required__') 24 | assert hasattr(subclass, '__optional__') 25 | 26 | 27 | def test_save_load_jobs(): 28 | jobs = [ 29 | UrlJob(name='news', url='http://news.orf.at/'), 30 | ShellJob(name='list homedir', command='ls ~'), 31 | ShellJob(name='list proc', command='ls /proc'), 32 | ] 33 | 34 | # tempfile.NamedTemporaryFile() doesn't work on Windows 35 | # because the returned file object cannot be opened again 36 | fd, name = tempfile.mkstemp() 37 | UrlsYaml(name).save(jobs) 38 | jobs2 = 
UrlsYaml(name).load()
    os.chmod(name, 0o777)
    jobs3 = UrlsYaml(name).load_secure()
    os.close(fd)
    os.remove(name)

    assert len(jobs2) == len(jobs)
    # Assert that the shell jobs have been removed due to secure loading
    if sys.platform != 'win32':
        assert len(jobs3) == 1


def test_load_config_yaml():
    # The shipped test configuration must equal the built-in defaults.
    config_file = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    if os.path.exists(config_file):
        config = YamlConfigStorage(config_file)
        assert config is not None
        assert config.config is not None
        assert config.config == storage.DEFAULT_CONFIG


def test_load_urls_txt():
    # Legacy plain-text job lists must still parse via secure loading.
    urls_txt = os.path.join(os.path.dirname(__file__), 'data', 'urls.txt')
    if os.path.exists(urls_txt):
        assert len(UrlsTxt(urls_txt).load_secure()) > 0


def test_load_urls_yaml():
    # The shipped example YAML job list must parse via secure loading.
    urls_yaml = 'share/urlwatch/examples/urls.yaml.example'
    if os.path.exists(urls_yaml):
        assert len(UrlsYaml(urls_yaml).load_secure()) > 0


def test_load_hooks_py():
    # The example hooks file must be importable as a Python module.
    hooks_py = 'share/urlwatch/examples/hooks.py.example'
    if os.path.exists(hooks_py):
        imp.load_source('hooks', hooks_py)


def test_pep8_conformance():
    """Test that we conform to PEP-8."""
    style = pycodestyle.StyleGuide(ignore=['E501', 'E402', 'W503'])

    py_files = [y for x in os.walk(os.path.abspath('.')) for y in glob(os.path.join(x[0], '*.py'))]
    result = style.check_files(py_files)
    assert result.total_errors == 0, "Found #{0} code style errors".format(result.total_errors)


class TestConfig(BaseConfig):
    # Minimal urlwatch configuration pointing at the test data directory.
    def __init__(self, config, urls, cache, hooks, verbose):
        (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
        super().__init__('urlwatch', os.path.dirname(__file__), config, urls, cache, hooks, verbose)
        self.edit = False
        self.edit_hooks = False


def teardown_func():
    "tear 
down test fixtures"
    # Remove the cache database created by a test run, if any.
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    if os.path.exists(cache):
        os.remove(cache)


@with_setup(teardown=teardown_func)
def test_run_watcher():
    # Run the watcher end-to-end against the shipped example job list.
    urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'share', 'urlwatch', 'examples', 'urls.yaml.example')
    config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    hooks = ''

    config_storage = YamlConfigStorage(config)
    urls_storage = UrlsYaml(urls)
    cache_storage = CacheMiniDBStorage(cache)
    try:
        urlwatch_config = TestConfig(config, urls, cache, hooks, True)

        urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)
        urlwatcher.run_jobs()
    finally:
        cache_storage.close()


def test_unserialize_shell_job_without_kind():
    # 'kind' may be omitted when the keys identify the job type uniquely.
    job = JobBase.unserialize({
        'name': 'hoho',
        'command': 'ls',
    })
    assert isinstance(job, ShellJob)


@raises(ValueError)
def test_unserialize_with_unknown_key():
    # Unknown keys must be rejected instead of silently ignored.
    JobBase.unserialize({
        'unknown_key': 123,
        'name': 'hoho',
    })


def prepare_retry_test():
    # Build an Urlwatch instance around a job list with an unreachable URL
    # (max_tries: 2 in test/data/invalid-url.yaml), for the retry tests below.
    urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'test', 'data', 'invalid-url.yaml')
    config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    hooks = ''

    config_storage = YamlConfigStorage(config)
    cache_storage = CacheMiniDBStorage(cache)
    urls_storage = UrlsYaml(urls)

    urlwatch_config = TestConfig(config, urls, cache, hooks, True)
    urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)

    return urlwatcher, cache_storage


@with_setup(teardown=teardown_func)
def test_number_of_tries_in_cache_is_increased():
    # Two failing runs must record two tries in the cache and end in 'error'.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        guid = urlwatcher.jobs[0].get_guid()
        _, _, tries, _ = cache_storage.load(urlwatcher.jobs[0], guid)
        assert tries == 0

        urlwatcher.run_jobs()
        urlwatcher.run_jobs()

        guid = urlwatcher.jobs[0].get_guid()
        _, _, tries, _ = cache_storage.load(urlwatcher.jobs[0], guid)

        assert tries == 2
        assert urlwatcher.report.job_states[-1].verb == 'error'
    finally:
        cache_storage.close()


@with_setup(teardown=teardown_func)
def test_report_error_when_out_of_tries():
    # Once the retry budget is exhausted the job is reported as an error.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0

        urlwatcher.run_jobs()
        urlwatcher.run_jobs()

        assert urlwatcher.report.job_states[-1].verb == 'error'
    finally:
        cache_storage.close()


@with_setup(teardown=teardown_func)
def test_reset_tries_to_zero_when_successful():
    # A successful run must reset the retry counter stored in the cache.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0

        urlwatcher.run_jobs()

        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 1

        # use an url that definitely exists
        job = urlwatcher.jobs[0]
        job.url = 'file://' + os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')

        urlwatcher.run_jobs()

        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0
    finally:
        cache_storage.close()
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 6 | 7 | ## [Unreleased] 8 | 9 | ### Added 10 | - New filter: `re.sub` that can replace/remove strings using regular expressions 11 | 12 | ### Fixed 13 | - Fix `--test-filter` when the specified job is not found 14 | 15 | ### Changed 16 | - Nicer formatting of `--features` for jobs with no docstring or many keys 17 | 18 | 19 | ## [2.17] -- 2019-04-12 20 | 21 | ### Added 22 | - XPath/CSS: Support for excluding elements (#333, by Chenfeng Bao) 23 | - Add support for using external `diff_tool` on Windows (#373, by Chenfeng Bao) 24 | - Document how to use Amazon Simple E-Mail Service "SES" (by mborsetti) 25 | - Compare data with multiple old versions (`compared_versions`, #328, by Chenfeng Bao) 26 | 27 | ### Fixed 28 | - YAML: Fix deprecation warnings (#367, by Florent Aide) 29 | - Updated manpage with new options: Authentication, filter tests (Fixes #351) 30 | - Text formatter: Do not emit empty lines for `line_length=0` (Fixes #357) 31 | 32 | ### Changed 33 | - SMTP configuration fix: Only use smtp.user config if it's a non-empty value 34 | 35 | 36 | ## [2.16] -- 2019-01-27 37 | 38 | ### Added 39 | - XPath: Handle `/text()` selector (#282) 40 | - Document how to specify cookies to README.md (#264) 41 | - Text Reporter: `minimal` config option to only print a summary (PR#304, fixes #147) 42 | - README.md: Document how to watch Github releases via XPath (#266) 43 | - Support for parsing XML/RSS with XPath (Fixes #281) 44 | - Allow explicit setting of `encoding` for URL jobs (PR#313, contributes to #306) 45 | - Slack Channel Reporter (PR#309) 46 | - ANSI color output on the Windows console via `colorama` (PR#296, 
closes #295)
- Support for using CSS selectors via the `cssselect` module (PR#321, closes #273)
- `ignore_http_error_codes` is now an option for URL jobs (PR#325, fixes #203)
- `job_defaults` in the config for globally specifying settings (PR#345, closes #253)
- Optional `timeout` (in seconds) for URL jobs to specify socket timeout (PR#348, closes #340)

### Removed
- Support for JSON storage (dead code that was never used in production; PR#336)

### Changed
- `HtmlReporter` now also highlights links for browser jobs (PR#303)
- Allow `--features` and `--edit-*` to run without `urls.yaml` (PR#301)
- When a previous run had errors, do not use conditional GETs (PR#313, fixes #292)
- Explicitly specify JSON pretty print `separators` for consistency (PR#343)
- Use data-driven unit tests/fixtures for easier unit test maintenance (PR#344)

### Fixed
- Fix migration issues with case-insensitive filesystems (#223)
- Correctly reset retry counter when job is added or unchanged (PR#291, PR#314)
- Fix a `FutureWarning` on Python 3.7 with regard to regular expressions (PR#299)
- If the filter list is empty, do not process the filter list (PR#308)
- Fix parsing/sanity-checking of `urls.yaml` after editing (PR#317, fixes #316)
- Fix Python 3.3 compatibility by depending on `enum34` there (PR#311)
- Allow running unit tests on Windows (PR#318)
- Fix migration issues introduced by PR#180 and #256 (PR#323, fixes #267)


## [2.15] -- 2018-10-23

### Added
- Support for Mailgun regions (by Daniel Peukert, PR#280)
- CLI: Allow multiple occurrences of 'filter' when adding jobs (PR#278)

### Changed
- Fixed incorrect name for chat_id config in the default config (by Robin B, PR#276)


## [2.14] -- 2018-08-30

### Added
- Filter to pretty-print JSON data: `format-json` (by Niko Böckerman, PR#250)
- 
List active Telegram chats using `--telegram-chats` (with fixes by Georg Pichler, PR#270) 88 | - Support for HTTP `ETag` header in URL jobs and `If-None-Match` (by Karol Babioch, PR#256) 89 | - Support for filtering HTML using XPath expressions, with `lxml` (PR#274, Fixes #226) 90 | - Added `install_dependencies` to `setup.py` commands for easy installing of dependencies 91 | - Added `ignore_connection_errors` per-job configuration option (by Karol Babioch, PR#261) 92 | 93 | ### Changed 94 | - Improved code (HTTP status codes, by Karol Babioch PR#258) 95 | - Improved documentation for setting up Telegram chat bots 96 | - Allow multiple chats for Telegram reporting (by Georg Pichler, PR#271) 97 | 98 | 99 | ## [2.13] -- 2018-06-03 100 | 101 | ### Added 102 | - Support for specifying a `diff_tool` (e.g. `wdiff`) for each job (Fixes #243) 103 | - Support for testing filters via `--test-filter JOB` (Fixes #237) 104 | 105 | ### Changed 106 | - Moved ChangeLog file to CHANGELOG.md and using Keep a Changelog format. 
107 | - Force version check in `setup.py`, to exclude Python 2 (Fixes #244) 108 | - Remove default parameter from internal `html2text` module (Fixes #239) 109 | - Better error/exception reporting in `--verbose` mode (Fixes #164) 110 | 111 | ### Removed 112 | - Old ChangeLog entries 113 | 114 | 115 | ## [2.12] -- 2018-06-01 116 | 117 | ### Fixed 118 | - Bugfix: Do not 'forget' old data if an exception occurs (Fixes #242) 119 | 120 | 121 | ## [2.11] -- 2018-05-19 122 | 123 | ### Fixed 124 | - Retry: Make sure `tries` is initialized to zero on load (Fixes #241) 125 | 126 | ### Changed 127 | - html2text: Make sure the bs4 method strips HTML tags (by Louis Sautier) 128 | 129 | 130 | ## [2.10] -- 2018-05-17 131 | 132 | ### Added 133 | - Browser: Add support for browser jobs using `requests-html` (Fixes #215) 134 | - Retry: Add support for optional retry count in job list (by cmichi, fixes #235) 135 | - HTTP: Add support for specifying optional headers (by Tero Mononen) 136 | 137 | ### Changed 138 | - File editing: Fix issue when `$EDITOR` contains spaces (Fixes #220) 139 | - ChangeLog: Add versions to recent ChangeLog entries (Fixes #235) 140 | 141 | 142 | ## [2.9] -- 2018-03-24 143 | 144 | ### Added 145 | - E-Mail: Add support for `--smtp-login` and document GMail SMTP usage 146 | - Pushover: Device and sound attribute (by Tobias Haupenthal) 147 | 148 | ### Changed 149 | - XDG: Move cache file to `XDG_CACHE_DIR` (by Maxime Werlen) 150 | - Migration: Unconditionally migrate urlwatch 1.x cache dirs (Fixes #206) 151 | 152 | ### Fixed 153 | - Cleanups: Fix out-of-date debug message, use https (by Jakub Wilk) 154 | 155 | 156 | ## [2.8] -- 2018-01-28 157 | 158 | ### Changed 159 | - Documentation: Mention `appdirs` (by e-dschungel) 160 | 161 | ### Fixed 162 | - SMTP: Fix handling of missing `user` field (by e-dschungel) 163 | - Manpage: Fix documentation of XDG environment variables (by Jelle van der Waa) 164 | - Unit tests: Fix imports for out-of-source-tree tests (by Maxime 
Werlen) 165 | 166 | 167 | ## [2.7] -- 2017-11-08 168 | 169 | ### Added 170 | - Filtering: `style` (by gvandenbroucke), `tag` (by cmichi) 171 | - New reporter: Telegram support (by gvandenbroucke) 172 | - Paths: Add `XDG_CONFIG_DIR` support (by Jelle van der Waa) 173 | 174 | ### Changed 175 | - ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) 176 | - HTTP: Option to avoid 304 responses, `Content-Type` header (by Vinicius Massuchetto) 177 | - html2text: Configuration options (by Vinicius Massuchetto) 178 | 179 | ### Fixed 180 | - Issue #127: Fix error reporting 181 | - E-Mail: Fix encodings (by Seokjin Han), Allow `user` parameter for SMTP (by Jay Sitter) 182 | 183 | 184 | ## [2.6] -- 2016-12-04 185 | 186 | ### Added 187 | - New filters: `sha1sum`, `hexdump`, `element-by-class` 188 | - New reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) 189 | 190 | ### Changed 191 | - Improved filters: `BeautifulSoup` support for `html2txt` (by lechuckcaptain) 192 | - Improved handlers: HTTP Proxy (by lechuckcaptain); support for `file://` URIs 193 | - CI Integration: Build configuration for Travis CI (by lechuckcaptain) 194 | - Consistency: Feature list is now sorted by name 195 | 196 | ### Fixed 197 | - Issue #108: Fix creation of example files on first startup 198 | - Issue #118: Fix match filters for missing keys 199 | - Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier 200 | 201 | 202 | Older ChangeLog entries can be found in the 203 | [old ChangeLog file](https://github.com/thp/urlwatch/blob/2.12/ChangeLog), 204 | or with `git show 2.12:ChangeLog` on the command line. 205 | -------------------------------------------------------------------------------- /lib/urlwatch/jobs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 
4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import email.utils
import hashlib
import logging
import os
import re
import subprocess
import threading
import requests
import textwrap
import urlwatch
from requests.packages.urllib3.exceptions import InsecureRequestWarning

from .util import TrackSubClasses

# ssl_no_verify jobs would otherwise spam the log with warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

logger = logging.getLogger(__name__)


class ShellError(Exception):
    """Exception for shell commands with non-zero exit code"""

    def __init__(self, result):
        Exception.__init__(self)
        self.result = result  # the command's exit status

    def __str__(self):
        return '%s: Exit status %d' % (self.__class__.__name__, self.result)


class NotModifiedError(Exception):
    """Exception raised on HTTP 304 responses"""
    ...


class JobBase(object, metaclass=TrackSubClasses):
    # Registry of job kinds, populated by the TrackSubClasses metaclass.
    __subclasses__ = {}

    # Keys that must / may appear in a serialized job definition.
    __required__ = ()
    __optional__ = ()

    def __init__(self, **kwargs):
        # Set optional keys to None
        for k in self.__optional__:
            if k not in kwargs:
                setattr(self, k, None)

        # Fail if any required keys are not provided
        for k in self.__required__:
            if k not in kwargs:
                raise ValueError('Required field %s missing: %r' % (k, kwargs))

        for k, v in list(kwargs.items()):
            setattr(self, k, v)

    @classmethod
    def job_documentation(cls):
        """Build the human-readable listing of job kinds (used by --features)."""
        result = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            if sc.__doc__:
                result.append('  * %s - %s' % (sc.__kind__, sc.__doc__))
            else:
                result.append('  * %s' % (sc.__kind__,))

            for msg, value in (('    Required keys: ', sc.__required__), ('    Optional keys: ', sc.__optional__)):
                if value:
                    values = ('\n' + (len(msg) * ' ')).join(textwrap.wrap(', '.join(value), 79 - len(msg)))
                    result.append('%s%s' % (msg, values))
            result.append('')
        return '\n'.join(result)

    def get_location(self):
        """Return the job's identifying location (URL/command); abstract."""
        raise NotImplementedError()

    def pretty_name(self):
        """Return a human-readable name for the job; abstract."""
        raise NotImplementedError()

    def serialize(self):
        # Dict representation including the 'kind' discriminator.
        d = {'kind': self.__kind__}
        d.update(self.to_dict())
        return d

    @classmethod
    def unserialize(cls, data):
        if 'kind' not in data:
            # Try to auto-detect the kind of job based on the available keys
            kinds = [subclass.__kind__ for subclass in list(cls.__subclasses__.values())
                     if all(required in data for required in subclass.__required__) and not any(
                         key not in subclass.__required__ and key not in subclass.__optional__ for key in data)]

            if len(kinds) == 1:
                kind = kinds[0]
            elif len(kinds) == 0:
                raise ValueError('Kind is not specified, and no job matches: %r' % (data,))
            else:
                raise ValueError('Multiple kinds of jobs match %r: %r' % (data, kinds))
        else:
            kind = data['kind']

        return cls.__subclasses__[kind].from_dict(data)

    def to_dict(self):
        # Only keys that are set (non-None) are serialized.
        return {k: getattr(self, k) for keys in (self.__required__, self.__optional__) for k in keys
                if getattr(self, k) is not None}

    @classmethod
    def from_dict(cls, data):
        # Ignore keys that are neither required nor optional for this kind.
        return cls(**{k: v for k, v in list(data.items()) if k in cls.__required__ or k in cls.__optional__})

    def __repr__(self):
        return '<%s %s>' % (self.__kind__, ' '.join('%s=%r' % (k, v) for k, v in list(self.to_dict().items())))

    def _set_defaults(self, defaults):
        # Fill unset optional keys from a defaults mapping (see job_defaults).
        if isinstance(defaults, dict):
            for key, value in defaults.items():
                if key in self.__optional__ and getattr(self, key) is None:
                    setattr(self, key, value)

    def with_defaults(self, config):
        # Return a copy of this job with per-kind, then global, defaults applied.
        new_job = JobBase.unserialize(self.serialize())
        cfg = config.get('job_defaults')
        if isinstance(cfg, dict):
            new_job._set_defaults(cfg.get(self.__kind__))
            new_job._set_defaults(cfg.get('all'))
        return new_job

    def get_guid(self):
        # Stable cache identifier: SHA-1 of the job's location.
        location = self.get_location()
        sha_hash = hashlib.new('sha1')
        sha_hash.update(location.encode('utf-8'))
        return sha_hash.hexdigest()

    def request_resources(self, resources):
        """Request external resources.

        Check if required resources is available in `resources` (a dict).
        If not, request resources and save them in `resources`. Keys for
        resources should typically be `self` or class, depending on the
        shared or exclusive nature of the resource.

        This method should be called sequentially on the main thread to
        ensure success.

        Args:
            resources: A dict storing external resources.
        """
        ...

    def release_resources(self, resources):
        """Release external resources requested in `request_resources`."""
        ...

    def retrieve(self, job_state):
        """Fetch and return the job's current data; abstract."""
        raise NotImplementedError()

    def format_error(self, exception, tb):
        # Default: report the full traceback; subclasses may shorten it.
        return tb

    def ignore_error(self, exception):
        # Default: never ignore errors; subclasses add per-job ignore options.
        return False


class Job(JobBase):
    # Common base for concrete jobs: optional keys shared by all kinds.
    __required__ = ()
    __optional__ = ('name', 'filter', 'max_tries', 'diff_tool', 'compared_versions')

    # determine if hyperlink "a" tag is used in HtmlReporter
    LOCATION_IS_URL = False

    def pretty_name(self):
        return self.name if self.name else self.get_location()


class ShellJob(Job):
    """Run a shell command and get its standard output"""

    __kind__ = 'shell'

    __required__ = ('command',)
    __optional__ = ()

    def get_location(self):
        return self.command

    def retrieve(self, job_state):
        # NOTE: shell=True deliberately runs the user-configured command line
        # through the shell; the job list is trusted input.
        process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True)
        stdout_data, stderr_data = process.communicate()
        result = process.wait()
        if result != 0:
            raise ShellError(result)

        return 
stdout_data.decode('utf-8')


class UrlJob(Job):
    """Retrieve an URL from a web server"""

    __kind__ = 'url'

    __required__ = ('url',)
    __optional__ = ('cookies', 'data', 'method', 'ssl_no_verify', 'ignore_cached', 'http_proxy', 'https_proxy',
                    'headers', 'ignore_connection_errors', 'ignore_http_error_codes', 'encoding', 'timeout',
                    'ignore_timeout_errors', 'ignore_too_many_redirects')

    LOCATION_IS_URL = True
    # Matches "text/html; charset=..." / "text/plain; charset=..." values.
    CHARSET_RE = re.compile('text/(html|plain); charset=([^;]*)')

    def get_location(self):
        return self.url

    def retrieve(self, job_state):
        headers = {
            'User-agent': urlwatch.__user_agent__,
        }

        proxies = {
            'http': os.getenv('HTTP_PROXY'),
            'https': os.getenv('HTTPS_PROXY'),
        }

        if job_state.etag is not None:
            # Conditional GET: reuse the ETag from the previous response
            headers['If-None-Match'] = job_state.etag

        if job_state.timestamp is not None:
            headers['If-Modified-Since'] = email.utils.formatdate(job_state.timestamp)

        if self.ignore_cached or job_state.tries > 0:
            # Force a fresh (non-304) response, e.g. while retrying after errors
            headers['If-None-Match'] = None
            headers['If-Modified-Since'] = email.utils.formatdate(0)
            headers['Cache-Control'] = 'max-age=172800'
            headers['Expires'] = email.utils.formatdate()

        if self.method is None:
            self.method = "GET"
        if self.data is not None:
            # Presence of form data implies a POST request
            self.method = "POST"
            headers['Content-type'] = 'application/x-www-form-urlencoded'
            logger.info('Sending POST request to %s', self.url)

        if self.http_proxy is not None:
            proxies['http'] = self.http_proxy
        if self.https_proxy is not None:
            proxies['https'] = self.https_proxy

        file_scheme = 'file://'
        if self.url.startswith(file_scheme):
            logger.info('Using local filesystem (%s URI scheme)', file_scheme)
            return open(self.url[len(file_scheme):], 'rt').read()

        if self.headers:
            self.add_custom_headers(headers)

        if self.timeout is None:
            # default timeout
            timeout = 60
        elif self.timeout == 0:
            # never timeout
            timeout = None
        else:
            timeout = self.timeout

        response = requests.request(url=self.url,
                                    data=self.data,
                                    headers=headers,
                                    method=self.method,
                                    verify=(not self.ssl_no_verify),
                                    cookies=self.cookies,
                                    proxies=proxies,
                                    timeout=timeout)

        response.raise_for_status()
        if response.status_code == requests.codes.not_modified:
            raise NotModifiedError()

        # Save ETag from response into job_state, which will be saved in cache
        job_state.etag = response.headers.get('ETag')

        # If we're doing OCR, return the request content directly
        if self.filter is not None and 'ocr' in self.filter:
            return response.content

        # If we can't find the encoding in the headers, requests gets all
        # old-RFC-y and assumes ISO-8859-1 instead of UTF-8. Use the old
        # urlwatch behavior and try UTF-8 decoding first.
        content_type = response.headers.get('Content-type', '')
        content_type_match = self.CHARSET_RE.match(content_type)
        if not content_type_match and not self.encoding:
            # Fallback chain: UTF-8 -> Latin-1 -> UTF-8 (ignoring errors)
            try:
                try:
                    try:
                        return response.content.decode('utf-8')
                    except UnicodeDecodeError:
                        return response.content.decode('latin1')
                except UnicodeDecodeError:
                    return response.content.decode('utf-8', 'ignore')
            except LookupError:
                # If this is an invalid encoding, decode as ascii (Debian bug 731931)
                return response.content.decode('ascii', 'ignore')
        if self.encoding:
            response.encoding = self.encoding

        return response.text

    def add_custom_headers(self, headers):
        """
        Adds custom request headers from the job list (URLs) to the pre-filled dictionary `headers`.
        Pre-filled values of conflicting header keys (case-insensitive) are overwritten by custom value.
        """
        headers_to_remove = [x for x in headers if x.lower() in [y.lower() for y in self.headers]]
        for header in headers_to_remove:
            headers.pop(header, None)
        headers.update(self.headers)

    def format_error(self, exception, tb):
        if isinstance(exception, requests.exceptions.RequestException):
            # Instead of a full traceback, just show the HTTP error
            return str(exception)
        return tb

    def ignore_error(self, exception):
        # Honour the per-job ignore_* options for common request failures.
        if isinstance(exception, requests.exceptions.ConnectionError) and self.ignore_connection_errors:
            return True
        if isinstance(exception, requests.exceptions.Timeout) and self.ignore_timeout_errors:
            return True
        if isinstance(exception, requests.exceptions.TooManyRedirects) and self.ignore_too_many_redirects:
            return True
        elif isinstance(exception, requests.exceptions.HTTPError):
            # ignore_http_error_codes may be a single int, a comma-separated
            # string or a list; "5xx"-style entries match a whole status class.
            status_code = exception.response.status_code
            ignored_codes = []
            if isinstance(self.ignore_http_error_codes, int) and self.ignore_http_error_codes == status_code:
                return True
            elif isinstance(self.ignore_http_error_codes, str):
                ignored_codes = [s.strip().lower() for s in self.ignore_http_error_codes.split(',')]
            elif isinstance(self.ignore_http_error_codes, list):
                ignored_codes = [str(s).strip().lower() for s in self.ignore_http_error_codes]
            return str(status_code) in ignored_codes or '%sxx' % (status_code // 100) in ignored_codes
        return False


class BrowserJob(Job):
    """Retrieve an URL, emulating a real web browser"""

    __kind__ = 'browser'

    __required__ = ('navigate',)
    __optional__ = ('options',)

    LOCATION_IS_URL = True

    def get_location(self):
        return self.navigate

    def request_resources(self, resources):
        # Lazily start one shared pyppeteer browser plus its event-loop thread.
        import asyncio
import pyppeteer 386 | 387 | @asyncio.coroutine 388 | def _launch_browser(): 389 | browser = yield from pyppeteer.launch() 390 | for p in (yield from browser.pages()): 391 | yield from p.close() 392 | return browser 393 | 394 | if BrowserJob not in resources: 395 | event_loop = asyncio.new_event_loop() 396 | browser = event_loop.run_until_complete(_launch_browser()) 397 | loop_thread = threading.Thread(target=event_loop.run_forever) 398 | loop_thread.start() 399 | resources[BrowserJob] = { 400 | 'event_loop': event_loop, 401 | 'browser': browser, 402 | 'loop_thread': loop_thread 403 | } 404 | 405 | def release_resources(self, resources): 406 | import asyncio 407 | res = resources.get(BrowserJob) 408 | if res is not None: 409 | event_loop = res['event_loop'] 410 | browser = res['browser'] 411 | loop_thread = res['loop_thread'] 412 | event_loop.call_soon_threadsafe(event_loop.stop) 413 | loop_thread.join() 414 | event_loop.run_until_complete(browser.close()) 415 | del resources[BrowserJob] 416 | 417 | def retrieve(self, job_state): 418 | import asyncio 419 | 420 | @asyncio.coroutine 421 | def _get_content(browser): 422 | context = yield from browser.createIncognitoBrowserContext() 423 | page = yield from context.newPage() 424 | yield from page.goto(self.navigate, options=self.options) 425 | content = yield from page.content() 426 | yield from context.close() 427 | return content 428 | 429 | event_loop = job_state.resources[BrowserJob]['event_loop'] 430 | browser = job_state.resources[BrowserJob]['browser'] 431 | return asyncio.run_coroutine_threadsafe(_get_content(browser), event_loop).result() 432 | -------------------------------------------------------------------------------- /lib/urlwatch/command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import imp 32 | import logging 33 | import os 34 | import shutil 35 | import sys 36 | import requests 37 | import email.utils 38 | 39 | from .filters import FilterBase 40 | from .handler import JobState 41 | from .jobs import JobBase, UrlJob 42 | from .reporters import ReporterBase 43 | from .util import atomic_rename, edit_file 44 | from .mailer import set_password, have_password 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | 49 | class UrlwatchCommand: 50 | def __init__(self, urlwatcher): 51 | 52 | self.urlwatcher = urlwatcher 53 | self.urlwatch_config = urlwatcher.urlwatch_config 54 | 55 | def edit_hooks(self): 56 | fn_base, fn_ext = os.path.splitext(self.urlwatch_config.hooks) 57 | hooks_edit = fn_base + '.edit' + fn_ext 58 | try: 59 | if os.path.exists(self.urlwatch_config.hooks): 60 | shutil.copy(self.urlwatch_config.hooks, hooks_edit) 61 | elif self.urlwatch_config.hooks_py_example is not None and os.path.exists( 62 | self.urlwatch_config.hooks_py_example): 63 | shutil.copy(self.urlwatch_config.hooks_py_example, hooks_edit) 64 | edit_file(hooks_edit) 65 | imp.load_source('hooks', hooks_edit) 66 | atomic_rename(hooks_edit, self.urlwatch_config.hooks) 67 | print('Saving edit changes in', self.urlwatch_config.hooks) 68 | except SystemExit: 69 | raise 70 | except Exception as e: 71 | print('Parsing failed:') 72 | print('======') 73 | print(e) 74 | print('======') 75 | print('') 76 | print('The file', self.urlwatch_config.hooks, 'was NOT updated.') 77 | print('Your changes have been saved in', hooks_edit) 78 | return 1 79 | 80 | return 0 81 | 82 | def show_features(self): 83 | print() 84 | print('Supported jobs:\n') 85 | print(JobBase.job_documentation()) 86 | 87 | print('Supported filters:\n') 88 | print(FilterBase.filter_documentation()) 89 | print() 90 | print('Supported reporters:\n') 91 | print(ReporterBase.reporter_documentation()) 92 | print() 93 | return 0 94 | 95 | def list_urls(self): 96 | for idx, job in 
enumerate(self.urlwatcher.jobs): 97 | if self.urlwatch_config.verbose: 98 | print('%d: %s' % (idx + 1, repr(job))) 99 | else: 100 | pretty_name = job.pretty_name() 101 | location = job.get_location() 102 | if pretty_name != location: 103 | print('%d: %s ( %s )' % (idx + 1, pretty_name, location)) 104 | else: 105 | print('%d: %s' % (idx + 1, pretty_name)) 106 | return 0 107 | 108 | def _find_job(self, query): 109 | try: 110 | index = int(query) 111 | if index <= 0: 112 | return None 113 | try: 114 | return self.urlwatcher.jobs[index - 1] 115 | except IndexError: 116 | return None 117 | except ValueError: 118 | return next((job for job in self.urlwatcher.jobs if job.get_location() == query), None) 119 | 120 | def test_filter(self): 121 | job = self._find_job(self.urlwatch_config.test_filter) 122 | if job is None: 123 | print('Not found: %r' % (self.urlwatch_config.test_filter,)) 124 | return 1 125 | job = job.with_defaults(self.urlwatcher.config_storage.config) 126 | 127 | if isinstance(job, UrlJob): 128 | # Force re-retrieval of job, as we're testing filters 129 | job.ignore_cached = True 130 | 131 | resources = {} 132 | job.request_resources(resources) 133 | job_state = JobState(self.urlwatcher.cache_storage, resources, job) 134 | job_state.process() 135 | if job_state.exception is not None: 136 | raise job_state.exception 137 | print(job_state.new_data) 138 | # We do not save the job state or job on purpose here, since we are possibly modifying the job 139 | # (ignore_cached) and we do not want to store the newly-retrieved data yet (filter testing) 140 | job.release_resources(resources) 141 | return 0 142 | 143 | def modify_urls(self): 144 | save = True 145 | if self.urlwatch_config.delete is not None: 146 | job = self._find_job(self.urlwatch_config.delete) 147 | if job is not None: 148 | self.urlwatcher.jobs.remove(job) 149 | print('Removed %r' % (job,)) 150 | else: 151 | print('Not found: %r' % (self.urlwatch_config.delete,)) 152 | save = False 153 | 154 | if 
self.urlwatch_config.add is not None: 155 | # Allow multiple specifications of filter=, so that multiple filters can be specified on the CLI 156 | items = [item.split('=', 1) for item in self.urlwatch_config.add.split(',')] 157 | filters = [v for k, v in items if k == 'filter'] 158 | items = [(k, v) for k, v in items if k != 'filter'] 159 | d = {k: v for k, v in items} 160 | if filters: 161 | d['filter'] = ','.join(filters) 162 | 163 | job = JobBase.unserialize(d) 164 | print('Adding %r' % (job,)) 165 | self.urlwatcher.jobs.append(job) 166 | 167 | if save: 168 | self.urlwatcher.urls_storage.save(self.urlwatcher.jobs) 169 | 170 | return 0 171 | 172 | def handle_actions(self): 173 | if self.urlwatch_config.features: 174 | sys.exit(self.show_features()) 175 | if self.urlwatch_config.gc_cache: 176 | self.urlwatcher.cache_storage.gc([job.get_guid() for job in self.urlwatcher.jobs]) 177 | sys.exit(0) 178 | if self.urlwatch_config.edit: 179 | sys.exit(self.urlwatcher.urls_storage.edit(self.urlwatch_config.urls_yaml_example)) 180 | if self.urlwatch_config.edit_hooks: 181 | sys.exit(self.edit_hooks()) 182 | if self.urlwatch_config.test_filter: 183 | sys.exit(self.test_filter()) 184 | if self.urlwatch_config.list: 185 | sys.exit(self.list_urls()) 186 | if self.urlwatch_config.report_timestamps: 187 | sys.exit(self.report_timestamps()) 188 | if self.urlwatch_config.add is not None or self.urlwatch_config.delete is not None: 189 | sys.exit(self.modify_urls()) 190 | 191 | def check_edit_config(self): 192 | if self.urlwatch_config.edit_config: 193 | sys.exit(self.urlwatcher.config_storage.edit()) 194 | 195 | def check_telegram_chats(self): 196 | if self.urlwatch_config.telegram_chats: 197 | config = self.urlwatcher.config_storage.config['report'].get('telegram', None) 198 | if not config: 199 | print('You need to configure telegram in your config first (see README.md)') 200 | sys.exit(1) 201 | 202 | bot_token = config.get('bot_token', None) 203 | if not bot_token: 204 | 
print('You need to set up your bot token first (see README.md)') 205 | sys.exit(1) 206 | 207 | info = requests.get('https://api.telegram.org/bot{}/getMe'.format(bot_token)).json() 208 | 209 | chats = {} 210 | for chat_info in requests.get('https://api.telegram.org/bot{}/getUpdates'.format(bot_token)).json()['result']: 211 | chat = chat_info['message']['chat'] 212 | if chat['type'] == 'private': 213 | chats[str(chat['id'])] = ' '.join((chat['first_name'], chat['last_name'])) if 'last_name' in chat else chat['first_name'] 214 | 215 | if not chats: 216 | print('No chats found. Say hello to your bot at https://t.me/{}'.format(info['result']['username'])) 217 | sys.exit(1) 218 | 219 | headers = ('Chat ID', 'Name') 220 | maxchat = max(len(headers[0]), max((len(k) for k, v in chats.items()), default=0)) 221 | maxname = max(len(headers[1]), max((len(v) for k, v in chats.items()), default=0)) 222 | fmt = '%-' + str(maxchat) + 's %s' 223 | print(fmt % headers) 224 | print(fmt % ('-' * maxchat, '-' * maxname)) 225 | for k, v in sorted(chats.items(), key=lambda kv: kv[1]): 226 | print(fmt % (k, v)) 227 | print('\nChat up your bot here: https://t.me/{}'.format(info['result']['username'])) 228 | sys.exit(0) 229 | 230 | def check_test_slack(self): 231 | if self.urlwatch_config.test_slack: 232 | config = self.urlwatcher.config_storage.config['report'].get('slack', None) 233 | if not config: 234 | print('You need to configure slack in your config first (see README.md)') 235 | sys.exit(1) 236 | 237 | webhook_url = config.get('webhook_url', None) 238 | if not webhook_url: 239 | print('You need to set up your slack webhook_url first (see README.md)') 240 | sys.exit(1) 241 | 242 | info = requests.post(webhook_url, json={"text": "Test message from urlwatch, your configuration is working"}) 243 | if info.status_code == requests.codes.ok: 244 | print('Successfully sent message to Slack') 245 | sys.exit(0) 246 | else: 247 | print('Error while submitting message to 
Slack:{0}'.format(info.text)) 248 | sys.exit(1) 249 | 250 | def check_smtp_login(self): 251 | if self.urlwatch_config.smtp_login: 252 | config = self.urlwatcher.config_storage.config['report']['email'] 253 | smtp_config = config['smtp'] 254 | 255 | success = True 256 | 257 | if not config['enabled']: 258 | print('Please enable e-mail reporting in the config first.') 259 | success = False 260 | 261 | if config['method'] != 'smtp': 262 | print('Please set the method to SMTP for the e-mail reporter.') 263 | success = False 264 | 265 | if not smtp_config.get('auth', smtp_config.get('keyring', False)): 266 | print('Authentication must be enabled for SMTP.') 267 | success = False 268 | 269 | smtp_hostname = smtp_config['host'] 270 | if not smtp_hostname: 271 | print('Please configure the SMTP hostname in the config first.') 272 | success = False 273 | 274 | smtp_username = smtp_config.get('user', None) or config['from'] 275 | if not smtp_username: 276 | print('Please configure the SMTP user in the config first.') 277 | success = False 278 | 279 | if not success: 280 | sys.exit(1) 281 | 282 | if 'insecure_password' in smtp_config: 283 | print('The password is already set in the config (key "insecure_password").') 284 | sys.exit(0) 285 | 286 | if have_password(smtp_hostname, smtp_username): 287 | message = 'Password for %s / %s already set, update? [y/N] ' % (smtp_username, smtp_hostname) 288 | if input(message).lower() != 'y': 289 | print('Password unchanged.') 290 | sys.exit(0) 291 | 292 | if success: 293 | set_password(smtp_hostname, smtp_username) 294 | # TODO: Actually verify that the login to the server works 295 | 296 | sys.exit(0) 297 | 298 | # for COVID19tracker, produce a sorted report of each entry in the storage database along with its 299 | # last-updated timestamp and push it to a webhook. 
running_after_jobs is false when invoked with 300 | # --report-timestamps, and true if invoked automatically after regular jobs are run 301 | def report_timestamps(self, running_after_jobs = False): 302 | states = {} 303 | for idx, job in enumerate(self.urlwatcher.jobs): 304 | pretty_name = job.pretty_name() 305 | guid = job.get_guid() 306 | job_data = self.urlwatcher.cache_storage.load(None, guid) 307 | job_data_content = job_data[0] 308 | timestamp = job_data[1] 309 | states[pretty_name] = {'content': '', 'timestamp': timestamp, 310 | 'error': ("error" in job_data_content.lower())} 311 | 312 | # define strings to use to determine error messaging 313 | css_xpath_errors = ['cssfilter', 'xpathfilter'] 314 | loading_errors = ['read timed out', 'connection reset', 'timed out', 'server error', '503', '500', '522', '424', '401', 'timeouterror', 'failed to establish a new connection', 'max retries exceeded'] 315 | not_found_errors = ['not found for url', '404'] 316 | 317 | error_content = '' 318 | # check to see if an error was picked up 319 | if states[pretty_name]['error']: 320 | # check for css_xpath_errors strings 321 | if any(substring in job_data_content.lower() for substring in css_xpath_errors): 322 | error_content = 'Current filter rules are not working.' 323 | 324 | # check for various loading errors 325 | if any(substring in job_data_content.lower() for substring in loading_errors): 326 | error_content = 'Error loading URL. May be down.' 327 | 328 | # check for 404 specific 329 | if any(substring in job_data_content.lower() for substring in not_found_errors): 330 | error_content = 'URL not found anymore.' 331 | 332 | # did not match any checks 333 | if not error_content: 334 | error_content = 'Unknown error.' 
335 | 336 | # update content 337 | states[pretty_name].update({'content': error_content}) 338 | 339 | states = sorted(states.items(), key=lambda x: x[1]['timestamp'], reverse=True) 340 | output = "" 341 | for state in states: 342 | timestamp_formatted = email.utils.formatdate(state[1]['timestamp'], localtime=1) 343 | output += ("%s|%s|%s\n" % (state[0], state[1]['timestamp'], state[1]['content'])) 344 | 345 | if not running_after_jobs: 346 | print(output) 347 | 348 | config = self.urlwatcher.config_storage.config['report'].get('timestamp_webhook', None) 349 | if not config: 350 | print('You need to configure timestamp_webhook in your config first') 351 | sys.exit(1) 352 | if not config['enabled']: 353 | print('Timestamp reporting not enabled') 354 | 355 | webhook_url = config.get('webhook_url', None) 356 | post_data = {'value1': output} 357 | result = requests.post(webhook_url, json=post_data) 358 | 359 | try: 360 | if result.status_code == requests.codes.ok: 361 | logger.info("Webhook response: ok") 362 | else: 363 | logger.error("Webhook error: {0}".format(result.text)) 364 | except ValueError: 365 | logger.error( 366 | "Failed to parse webook response. 
HTTP status code: {0}, content: {1}".format(result.status_code, 367 | result.content)) 368 | if not running_after_jobs: 369 | sys.exit(0) 370 | 371 | def run(self): 372 | self.check_edit_config() 373 | self.check_smtp_login() 374 | self.check_telegram_chats() 375 | self.check_test_slack() 376 | self.handle_actions() 377 | self.urlwatcher.run_jobs() 378 | 379 | # if timestamp reporting is enabled, 380 | config = self.urlwatcher.config_storage.config['report'].get('timestamp_webhook', None) 381 | if config['enabled']: 382 | self.report_timestamps(running_after_jobs = True); 383 | 384 | self.urlwatcher.close() 385 | -------------------------------------------------------------------------------- /lib/urlwatch/storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import os 32 | import stat 33 | import copy 34 | import platform 35 | from abc import ABCMeta, abstractmethod 36 | 37 | import shutil 38 | import yaml 39 | import minidb 40 | import logging 41 | 42 | from .util import atomic_rename, edit_file 43 | from .jobs import JobBase, UrlJob, ShellJob 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | DEFAULT_CONFIG = { 48 | 'display': { 49 | 'new': True, 50 | 'error': True, 51 | 'unchanged': False, 52 | }, 53 | 54 | 'report': { 55 | 'text': { 56 | 'line_length': 75, 57 | 'details': True, 58 | 'footer': True, 59 | 'minimal': False, 60 | }, 61 | 62 | 'html': { 63 | 'diff': 'unified', # "unified" or "table" 64 | }, 65 | 66 | 'stdout': { 67 | 'enabled': True, 68 | 'color': True, 69 | }, 70 | 71 | 'email': { 72 | 'enabled': False, 73 | 74 | 'html': False, 75 | 'to': '', 76 | 'from': '', 77 | 'subject': '{count} changes: {jobs}', 78 | 'method': 'smtp', 79 | 'smtp': { 80 | 'host': 'localhost', 81 | 'user': '', 82 | 'port': 25, 83 | 'starttls': True, 84 | 'auth': True, 85 | }, 86 | 'sendmail': { 87 | 'path': 'sendmail', 88 | } 89 | }, 90 | 'pushover': { 91 | 'enabled': False, 92 | 'app': '', 93 | 'device': None, 94 | 'sound': 'spacealarm', 95 | 'user': '', 96 | }, 97 | 'pushbullet': { 98 | 'enabled': False, 99 | 'api_key': '', 100 | }, 101 | 'telegram': { 102 | 'enabled': False, 103 | 'bot_token': '', 104 | 'chat_id': '', 105 | }, 106 | 'slack': { 107 | 'enabled': False, 108 | 
'webhook_url': '', 109 | }, 110 | 'webhook': { 111 | 'enabled': False, 112 | 'webhook_url': '', 113 | }, 114 | 'timestamp_webhook': { 115 | 'enabled': False, 116 | 'webhook_url': '', 117 | }, 118 | 'mailgun': { 119 | 'enabled': False, 120 | 'region': 'us', 121 | 'api_key': '', 122 | 'domain': '', 123 | 'from_mail': '', 124 | 'from_name': '', 125 | 'to': '', 126 | 'subject': '{count} changes: {jobs}' 127 | }, 128 | }, 129 | 130 | 'job_defaults': { 131 | 'all': {}, 132 | 'shell': {}, 133 | 'url': {}, 134 | 'browser': {} 135 | } 136 | } 137 | 138 | 139 | def merge(source, destination): 140 | # http://stackoverflow.com/a/20666342 141 | for key, value in source.items(): 142 | if isinstance(value, dict): 143 | # get node or create one 144 | node = destination.setdefault(key, {}) 145 | merge(value, node) 146 | else: 147 | destination[key] = value 148 | 149 | return destination 150 | 151 | 152 | def get_current_user(): 153 | try: 154 | return os.getlogin() 155 | except OSError: 156 | # If there is no controlling terminal, because urlwatch is launched by 157 | # cron, or by a systemd.service for example, os.getlogin() fails with: 158 | # OSError: [Errno 25] Inappropriate ioctl for device 159 | import pwd 160 | return pwd.getpwuid(os.getuid()).pw_name 161 | 162 | 163 | class BaseStorage(metaclass=ABCMeta): 164 | @abstractmethod 165 | def load(self, *args): 166 | ... 167 | 168 | @abstractmethod 169 | def save(self, *args): 170 | ... 171 | 172 | 173 | class BaseFileStorage(BaseStorage, metaclass=ABCMeta): 174 | def __init__(self, filename): 175 | self.filename = filename 176 | 177 | 178 | class BaseTextualFileStorage(BaseFileStorage, metaclass=ABCMeta): 179 | def __init__(self, filename): 180 | super().__init__(filename) 181 | self.config = {} 182 | self.load() 183 | 184 | @classmethod 185 | @abstractmethod 186 | def parse(cls, *args): 187 | ... 
188 | 189 | def edit(self, example_file=None): 190 | fn_base, fn_ext = os.path.splitext(self.filename) 191 | file_edit = fn_base + '.edit' + fn_ext 192 | 193 | if os.path.exists(self.filename): 194 | shutil.copy(self.filename, file_edit) 195 | elif example_file is not None and os.path.exists(example_file): 196 | shutil.copy(example_file, file_edit) 197 | 198 | while True: 199 | try: 200 | edit_file(file_edit) 201 | # Check if we can still parse it 202 | if self.parse is not None: 203 | self.parse(file_edit) 204 | break # stop if no exception on parser 205 | except SystemExit: 206 | raise 207 | except Exception as e: 208 | print('Parsing failed:') 209 | print('======') 210 | print(e) 211 | print('======') 212 | print('') 213 | print('The file', file_edit, 'was NOT updated.') 214 | user_input = input("Do you want to retry the same edit? (y/n)") 215 | if user_input.lower()[0] == 'y': 216 | continue 217 | print('Your changes have been saved in', file_edit) 218 | return 1 219 | 220 | atomic_rename(file_edit, self.filename) 221 | print('Saving edit changes in', self.filename) 222 | return 0 223 | 224 | @classmethod 225 | def write_default_config(cls, filename): 226 | config_storage = cls(None) 227 | config_storage.filename = filename 228 | config_storage.save() 229 | 230 | 231 | class UrlsBaseFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 232 | def __init__(self, filename): 233 | self.filename = filename 234 | 235 | def shelljob_security_checks(self): 236 | 237 | if platform.system() == 'Windows': 238 | return [] 239 | 240 | shelljob_errors = [] 241 | current_uid = os.getuid() 242 | 243 | dirname = os.path.dirname(self.filename) or '.' 
244 | dir_st = os.stat(dirname) 245 | if (dir_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: 246 | shelljob_errors.append('%s is group/world-writable' % dirname) 247 | if dir_st.st_uid != current_uid: 248 | shelljob_errors.append('%s not owned by %s' % (dirname, get_current_user())) 249 | 250 | file_st = os.stat(self.filename) 251 | if (file_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: 252 | shelljob_errors.append('%s is group/world-writable' % self.filename) 253 | if file_st.st_uid != current_uid: 254 | shelljob_errors.append('%s not owned by %s' % (self.filename, get_current_user())) 255 | 256 | return shelljob_errors 257 | 258 | def load_secure(self): 259 | jobs = self.load() 260 | 261 | # Security checks for shell jobs - only execute if the current UID 262 | # is the same as the file/directory owner and only owner can write 263 | shelljob_errors = self.shelljob_security_checks() 264 | if shelljob_errors and any(isinstance(job, ShellJob) for job in jobs): 265 | print(('Removing shell jobs, because %s' % (' and '.join(shelljob_errors),))) 266 | jobs = [job for job in jobs if not isinstance(job, ShellJob)] 267 | 268 | return jobs 269 | 270 | 271 | class BaseTxtFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 272 | @classmethod 273 | def parse(cls, *args): 274 | filename = args[0] 275 | if filename is not None and os.path.exists(filename): 276 | with open(filename) as fp: 277 | for line in fp: 278 | line = line.strip() 279 | if not line or line.startswith('#'): 280 | continue 281 | 282 | if line.startswith('|'): 283 | yield ShellJob(command=line[1:]) 284 | else: 285 | args = line.split(None, 2) 286 | if len(args) == 1: 287 | yield UrlJob(url=args[0]) 288 | elif len(args) == 2: 289 | yield UrlJob(url=args[0], post=args[1]) 290 | else: 291 | raise ValueError('Unsupported line format: %r' % (line,)) 292 | 293 | 294 | class BaseYamlFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 295 | @classmethod 296 | def parse(cls, *args): 297 | filename = 
args[0] 298 | if filename is not None and os.path.exists(filename): 299 | with open(filename) as fp: 300 | return yaml.load(fp, Loader=yaml.SafeLoader) 301 | 302 | 303 | class YamlConfigStorage(BaseYamlFileStorage): 304 | def load(self, *args): 305 | self.config = merge(self.parse(self.filename) or {}, copy.deepcopy(DEFAULT_CONFIG)) 306 | 307 | def save(self, *args): 308 | with open(self.filename, 'w') as fp: 309 | yaml.dump(self.config, fp, default_flow_style=False) 310 | 311 | 312 | class UrlsYaml(BaseYamlFileStorage, UrlsBaseFileStorage): 313 | 314 | @classmethod 315 | def parse(cls, *args): 316 | filename = args[0] 317 | if filename is not None and os.path.exists(filename): 318 | with open(filename) as fp: 319 | return [JobBase.unserialize(job) for job in yaml.load_all(fp, Loader=yaml.SafeLoader) if job is not None] 320 | 321 | def save(self, *args): 322 | jobs = args[0] 323 | print('Saving updated list to %r' % self.filename) 324 | 325 | with open(self.filename, 'w') as fp: 326 | yaml.dump_all([job.serialize() for job in jobs], fp, default_flow_style=False) 327 | 328 | def load(self, *args): 329 | with open(self.filename) as fp: 330 | return [JobBase.unserialize(job) for job in yaml.load_all(fp, Loader=yaml.SafeLoader) if job is not None] 331 | 332 | 333 | class UrlsTxt(BaseTxtFileStorage, UrlsBaseFileStorage): 334 | def load(self): 335 | return list(self.parse(self.filename)) 336 | 337 | def save(self, jobs): 338 | print(jobs) 339 | raise NotImplementedError() 340 | 341 | 342 | class CacheStorage(BaseFileStorage, metaclass=ABCMeta): 343 | @abstractmethod 344 | def close(self): 345 | ... 346 | 347 | @abstractmethod 348 | def get_guids(self): 349 | ... 350 | 351 | @abstractmethod 352 | def load(self, job, guid): 353 | ... 354 | 355 | @abstractmethod 356 | def save(self, job, guid, data, timestamp, tries, etag=None): 357 | ... 358 | 359 | @abstractmethod 360 | def delete(self, guid): 361 | ... 362 | 363 | @abstractmethod 364 | def clean(self, guid): 365 | ... 
366 | 367 | def backup(self): 368 | for guid in self.get_guids(): 369 | data, timestamp, tries, etag = self.load(None, guid) 370 | yield guid, data, timestamp, tries, etag 371 | 372 | def restore(self, entries): 373 | for guid, data, timestamp, tries, etag in entries: 374 | self.save(None, guid, data, timestamp, tries, etag) 375 | 376 | def gc(self, known_guids): 377 | for guid in set(self.get_guids()) - set(known_guids): 378 | print('Removing: {guid}'.format(guid=guid)) 379 | self.delete(guid) 380 | 381 | for guid in known_guids: 382 | count = self.clean(guid) 383 | if count > 0: 384 | print('Removed {count} old versions of {guid}'.format(count=count, guid=guid)) 385 | 386 | 387 | class CacheDirStorage(CacheStorage): 388 | def __init__(self, filename): 389 | super().__init__(filename) 390 | if not os.path.exists(filename): 391 | os.makedirs(filename) 392 | 393 | def close(self): 394 | # No need to close 395 | return 0 396 | 397 | def _get_filename(self, guid): 398 | return os.path.join(self.filename, guid) 399 | 400 | def get_guids(self): 401 | return os.listdir(self.filename) 402 | 403 | def load(self, job, guid): 404 | filename = self._get_filename(guid) 405 | if not os.path.exists(filename): 406 | return None, None, None, None 407 | 408 | try: 409 | with open(filename) as fp: 410 | data = fp.read() 411 | except UnicodeDecodeError: 412 | with open(filename, 'rb') as fp: 413 | data = fp.read().decode('utf-8', 'ignore') 414 | 415 | timestamp = os.stat(filename)[stat.ST_MTIME] 416 | 417 | return data, timestamp, None, None 418 | 419 | def save(self, job, guid, data, timestamp, etag=None): 420 | # Timestamp and ETag are always ignored 421 | filename = self._get_filename(guid) 422 | with open(filename, 'w+') as fp: 423 | fp.write(data) 424 | 425 | def delete(self, guid): 426 | filename = self._get_filename(guid) 427 | if os.path.exists(filename): 428 | os.unlink(filename) 429 | 430 | def clean(self, guid): 431 | # We only store the latest version, no need to clean 
432 | return 0 433 | 434 | 435 | class CacheEntry(minidb.Model): 436 | guid = str 437 | timestamp = int 438 | data = str 439 | tries = int 440 | etag = str 441 | 442 | 443 | class CacheMiniDBStorage(CacheStorage): 444 | def __init__(self, filename): 445 | super().__init__(filename) 446 | 447 | dirname = os.path.dirname(filename) 448 | if dirname and not os.path.isdir(dirname): 449 | os.makedirs(dirname) 450 | 451 | self.db = minidb.Store(self.filename, debug=True) 452 | self.db.register(CacheEntry) 453 | 454 | def close(self): 455 | self.db.close() 456 | self.db = None 457 | 458 | def get_guids(self): 459 | return (guid for guid, in CacheEntry.query(self.db, minidb.Function('distinct', CacheEntry.c.guid))) 460 | 461 | def load(self, job, guid): 462 | for data, timestamp, tries, etag in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp // CacheEntry.c.tries // CacheEntry.c.etag, 463 | order_by=minidb.columns(CacheEntry.c.timestamp.desc, CacheEntry.c.tries.desc), 464 | where=CacheEntry.c.guid == guid, limit=1): 465 | return data, timestamp, tries, etag 466 | 467 | return None, None, 0, None 468 | 469 | def get_history_data(self, guid, count=1): 470 | history = {} 471 | if count < 1: 472 | return history 473 | for data, timestamp in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp, 474 | order_by=minidb.columns(CacheEntry.c.timestamp.desc, CacheEntry.c.tries.desc), 475 | where=(CacheEntry.c.guid == guid) 476 | & ((CacheEntry.c.tries == 0) | (CacheEntry.c.tries == None))): # noqa 477 | if data not in history: 478 | history[data] = timestamp 479 | if len(history) >= count: 480 | break 481 | return history 482 | 483 | def save(self, job, guid, data, timestamp, tries, etag=None): 484 | self.db.save(CacheEntry(guid=guid, timestamp=timestamp, data=data, tries=tries, etag=etag)) 485 | self.db.commit() 486 | 487 | def delete(self, guid): 488 | CacheEntry.delete_where(self.db, CacheEntry.c.guid == guid) 489 | self.db.commit() 490 | 491 
| def clean(self, guid): 492 | keep_id = next((CacheEntry.query(self.db, CacheEntry.c.id, where=CacheEntry.c.guid == guid, 493 | order_by=CacheEntry.c.timestamp.desc, limit=1)), (None,))[0] 494 | 495 | if keep_id is not None: 496 | result = CacheEntry.delete_where(self.db, (CacheEntry.c.guid == guid) & (CacheEntry.c.id != keep_id)) 497 | self.db.commit() 498 | return result 499 | 500 | return 0 501 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/thp/urlwatch.svg)](https://travis-ci.org/thp/urlwatch) 2 | [![Packaging status](https://repology.org/badge/tiny-repos/urlwatch.svg)](https://repology.org/metapackage/urlwatch/versions) 3 | [![PyPI version](https://badge.fury.io/py/urlwatch.svg)](https://badge.fury.io/py/urlwatch) 4 | 5 | 6 | ``` 7 | _ _ _ ____ 8 | _ _ _ __| |_ ____ _| |_ ___| |__ |___ \ 9 | | | | | '__| \ \ /\ / / _` | __/ __| '_ \ __) | 10 | | |_| | | | |\ V V / (_| | || (__| | | | / __/ 11 | \__,_|_| |_| \_/\_/ \__,_|\__\___|_| |_| |_____| 12 | 13 | ... monitors webpages for you 14 | ``` 15 | urlwatch is intended to help you watch changes in webpages and get notified 16 | (via e-mail, in your terminal or through various third party services) of any 17 | changes. The change notification will include the URL that has changed and 18 | a unified diff of what has changed. 
19 | 20 | 21 | DEPENDENCIES 22 | ------------ 23 | 24 | urlwatch 2 requires: 25 | 26 | * Python 3.5 or newer 27 | * [PyYAML](http://pyyaml.org/) 28 | * [minidb](https://thp.io/2010/minidb/) 29 | * [requests](http://python-requests.org/) 30 | * [keyring](https://github.com/jaraco/keyring/) 31 | * [appdirs](https://github.com/ActiveState/appdirs) 32 | * [lxml](https://lxml.de) 33 | * [cssselect](https://cssselect.readthedocs.io) 34 | * [enum34](https://pypi.org/project/enum34/) (Python 3.3 only) 35 | 36 | The dependencies can be installed with (add `--user` to install to `$HOME`): 37 | 38 | `python3 -m venv venv;source venv/bin/activate` 39 | 40 | `python3 -m pip install -r requirements-dev.txt` 41 | 42 | 43 | Optional dependencies (install via `python3 -m pip install `): 44 | 45 | * Pushover reporter: [chump](https://github.com/karanlyons/chump/) 46 | * Pushbullet reporter: [pushbullet.py](https://github.com/randomchars/pushbullet.py) 47 | * Stdout reporter with color on Windows: [colorama](https://github.com/tartley/colorama) 48 | * "browser" job kind: [pyppeteer](https://github.com/miyakogi/pyppeteer), Python 3.6 or newer 49 | * Unit testing: [pycodestyle](http://pycodestyle.pycqa.org/en/latest/) 50 | 51 | 52 | QUICK START 53 | ----------- 54 | 55 | 1. Start `urlwatch` to migrate your old data or start fresh 56 | 2. Use `urlwatch --edit` to customize your job list (this will create/edit `urls.yaml`) 57 | 3. Use `urlwatch --edit-config` if you want to set up e-mail sending 58 | 4. Use `urlwatch --edit-hooks` if you want to write custom subclasses 59 | 5. Add `urlwatch` to your crontab (`crontab -e`) to monitor webpages periodically 60 | 61 | The checking interval is defined by how often you run `urlwatch`. 62 | You can use e.g. [crontab.guru](https://crontab.guru) to figure out the 63 | schedule expression for the checking interval, we recommend not more often 64 | than 30 minutes (this would be `*/30 * * * *`). 
If you have never used 65 | cron before, check out the 66 | [crontab command help](https://www.computerhope.com/unix/ucrontab.htm). 67 | 68 | On Windows, `cron` is not installed by default. Use the 69 | [Windows Task Scheduler](https://en.wikipedia.org/wiki/Windows_Task_Scheduler) 70 | instead, or see [this StackOverflow question](https://stackoverflow.com/q/132971/1047040) 71 | for alternatives. 72 | 73 | 74 | TIPS AND TRICKS 75 | --------------- 76 | 77 | Quickly adding new URLs to the job list from the command line: 78 | 79 | ```urlwatch --add url=http://example.org,name=Example``` 80 | 81 | You can pick only a given HTML element with the built-in filter, for 82 | example to extract ```
<div id="something">...</div>
``` from a page, you 83 | can use the following in your urls.yaml: 84 | ```yaml 85 | url: http://example.org/ 86 | filter: element-by-id:something 87 | ``` 88 | 89 | Also, you can chain filters, so you can run html2text on the result: 90 | ```yaml 91 | url: http://example.net/ 92 | filter: element-by-id:something,html2text 93 | ``` 94 | 95 | The example urls.yaml file also demonstrates the use of built-in 96 | filters, here 3 filters are used: html2text, line-grep and whitespace 97 | removal to get just a certain info field from a webpage: 98 | ```yaml 99 | url: https://thp.io/2008/urlwatch/ 100 | filter: html2text,grep:Current.*version,strip 101 | ``` 102 | For most cases, this means that you can specify a filter chain in 103 | your urls.yaml page without requiring a custom hook where previously 104 | you would have needed to write custom filtering code in Python. 105 | 106 | If you are using the `grep` filter, you can grep for a comma (`,`) 107 | by using `\054` (`:` does not need to be escaped separately and 108 | can be used as-is), for example to convert HTML to text, then grep 109 | for `a,b:`, and then strip whitespace, use this: 110 | 111 | ```yaml 112 | url: https://example.org/ 113 | filter: html2text,grep:a\054b:,strip 114 | ``` 115 | 116 | If you want to extract only the body tag you can use this filer: 117 | ```yaml 118 | url: https://thp.io/2008/urlwatch/ 119 | filter: element-by-tag:body 120 | ``` 121 | 122 | You can also specify an external `diff`-style tool (a tool that takes 123 | two filenames (old, new) as parameter and returns on its standard output 124 | the difference of the files), for example to use GNU `wdiff` to get 125 | word-based differences instead of line-based difference: 126 | 127 | ```yaml 128 | url: https://example.com/ 129 | diff_tool: wdiff 130 | ``` 131 | 132 | Note that `diff_tool` specifies an external command-line tool, so that 133 | tool must be installed separately (e.g. 
`apt install wdiff` on Debian or 134 | `brew install wdiff` on macOS). Coloring is supported for `wdiff`-style 135 | output, but potentially not for other diff tools. 136 | 137 | To filter based on an [XPath](https://www.w3.org/TR/1999/REC-xpath-19991116/) 138 | expression, you can use the `xpath` filter like so (see Microsoft's 139 | [XPath Examples](https://msdn.microsoft.com/en-us/library/ms256086(v=vs.110).aspx) 140 | page for some other examples): 141 | 142 | ```yaml 143 | url: https://example.net/ 144 | filter: xpath:/body 145 | ``` 146 | 147 | This filters only the `` element of the HTML document, stripping 148 | out everything else. 149 | 150 | To filter based on a [CSS selector](https://www.w3.org/TR/2011/REC-css3-selectors-20110929/), 151 | you can use the `css` filter like so: 152 | 153 | ```yaml 154 | url: https://example.net/ 155 | filter: css:body 156 | ``` 157 | 158 | Some limitations and extensions exist as explained in 159 | [cssselect's documentation](https://cssselect.readthedocs.io/en/latest/#supported-selectors). 160 | 161 | In some cases, it might be useful to ignore (temporary) network errors to 162 | avoid notifications being sent. 
While there is a `display.error` config 163 | option (defaulting to `True`) to control reporting of errors globally, to 164 | ignore network errors for specific jobs only, you can use the 165 | `ignore_connection_errors` key in the job list configuration file: 166 | 167 | ```yaml 168 | url: https://example.com/ 169 | ignore_connection_errors: true 170 | ``` 171 | 172 | Similarly, you might want to ignore some (temporary) HTTP errors on the 173 | server side: 174 | 175 | ```yaml 176 | url: https://example.com/ 177 | ignore_http_error_codes: 408, 429, 500, 502, 503, 504 178 | ``` 179 | 180 | or ignore all HTTP errors if you like: 181 | 182 | ```yaml 183 | url: https://example.com/ 184 | ignore_http_error_codes: 4xx, 5xx 185 | ``` 186 | 187 | For web pages with misconfigured HTTP headers or rare encodings, it may 188 | be useful to explicitly specify an encoding from Python's 189 | [Standard Encodings](https://docs.python.org/3/library/codecs.html#standard-encodings). 190 | 191 | ```yaml 192 | url: https://example.com/ 193 | encoding: utf-8 194 | ``` 195 | 196 | By default, url jobs timeout after 60 seconds. If you want a different timeout 197 | period, use the `timeout` key to specify it in number of seconds, or set it to 0 198 | to never timeout. 199 | 200 | ```yaml 201 | url: https://example.com/ 202 | timeout: 300 203 | ``` 204 | 205 | If you want to change some settings for all your jobs, edit the `job_defaults` 206 | section in your config file: 207 | 208 | ```yaml 209 | ... 210 | job_defaults: 211 | all: 212 | diff_tool: wdiff 213 | url: 214 | ignore_connection_errors: true 215 | ``` 216 | The above config file sets all jobs to use wdiff as diff tool, and all "url" jobs 217 | to ignore connection errors. 218 | 219 | PUSHOVER 220 | -------- 221 | 222 | You can configure urlwatch to send real time notifications about changes 223 | via Pushover(https://pushover.net/). To enable this, ensure you have the 224 | chump python package installed (see DEPENDENCIES). 
Then edit your config 225 | (`urlwatch --edit-config`) and enable pushover. You will also need to add 226 | to the config your Pushover user key and a unique app key (generated by 227 | registering urlwatch as an application on your Pushover account(https://pushover.net/apps/build). 228 | 229 | You can send to a specific device by using the device name, as indicated when 230 | you add or view your list of devices in the Pushover console. For example 231 | `device: 'MyPhone'`, or `device: 'MyLaptop'`. To send to *all* of your 232 | devices, set `device: null` in your config (`urlwatch --edit-config`) or leave 233 | out the device configuration completely. 234 | 235 | 236 | PUSHBULLET 237 | -------- 238 | 239 | Pushbullet notifications are configured similarly to Pushover (see above). 240 | You'll need to add to the config your Pushbullet Access Token, which you 241 | can generate at https://www.pushbullet.com/#settings 242 | 243 | TELEGRAM 244 | -------- 245 | 246 | Telegram notifications are configured using the Telegram Bot API. 247 | For this, you'll need a Bot API token and a chat id (see https://core.telegram.org/bots). 248 | Sample configuration: 249 | 250 | ```yaml 251 | telegram: 252 | bot_token: '999999999:3tOhy2CuZE0pTaCtszRfKpnagOG8IQbP5gf' # your bot api token 253 | chat_id: '88888888' # the chat id where the messages should be sent 254 | enabled: true 255 | ``` 256 | 257 | To set up Telegram, from your Telegram app, chat up BotFather (New Message, 258 | Search, "BotFather"), then say `/newbot` and follow the instructions. 259 | Eventually it will tell you the bot token (in the form seen above, 260 | `:`) - add this to your config file. 261 | 262 | You can then click on the link of your bot, which will send the message `/start`. 263 | At this point, you can use the command `urlwatch --telegram-chats` to list the 264 | private chats the bot is involved with. This is the chat ID that you need to put 265 | into the config file as `chat_id`. 
You may add multiple chat IDs as a YAML list: 266 | ```yaml 267 | telegram: 268 | bot_token: '999999999:3tOhy2CuZE0pTaCtszRfKpnagOG8IQbP5gf' # your bot api token 269 | chat_id: 270 | - '11111111' 271 | - '22222222' 272 | enabled: true 273 | ``` 274 | 275 | Don't forget to also enable the reporter. 276 | 277 | 278 | SLACK 279 | ----- 280 | 281 | Slack notifications are configured using "Slack Incoming Webhooks". Here is a 282 | sample configuration: 283 | 284 | ```yaml 285 | slack: 286 | webhook_url: 'https://hooks.slack.com/services/T50TXXXXXU/BDVYYYYYYY/PWTqwyFM7CcCfGnNzdyDYZ' 287 | enabled: true 288 | ``` 289 | 290 | To set up Slack, from you Slack Team, create a new app and activate "Incoming Webhooks" on 291 | a channel, you'll get a webhook URL, copy it into the configuration as seen above. 292 | 293 | You can use the command `urlwatch --test-slack` to test if the Slack integration works. 294 | 295 | 296 | BROWSER 297 | ------- 298 | 299 | If the webpage you are trying to watch runs client-side JavaScript to 300 | render the page, [Pyppeteer](https://github.com/miyakogi/pyppeteer) can 301 | now be used to render the page in a headless Chromium instance first 302 | and then use the HTML of the resulting page. 303 | 304 | Use the `browser` kind in the configuration and the `navigate` key to set the 305 | URL to retrieve. note that the normal `url` job keys are not supported 306 | for the `browser` job types at the moment, for example: 307 | 308 | ```yaml 309 | kind: browser 310 | name: "A Page With JavaScript" 311 | navigate: http://example.org/ 312 | ``` 313 | 314 | Configure the optional `options` key to pass 315 | [additional options](https://miyakogi.github.io/pyppeteer/reference.html#pyppeteer.page.Page.goto) 316 | to Pyppeteer. 
For example: 317 | 318 | ```yaml 319 | kind: browser 320 | name: "A Page With JavaScript" 321 | navigate: http://example.org/ 322 | options: 323 | waitUntil: networkidle0 324 | ``` 325 | 326 | 327 | E-MAIL VIA GMAIL SMTP 328 | --------------------- 329 | 330 | You need to configure your GMail account to allow for "less secure" (password-based) 331 | apps to login: 332 | 333 | 1. Go to https://myaccount.google.com/ 334 | 2. Click on "Sign-in & security" 335 | 3. Scroll all the way down to "Allow less secure apps" and enable it 336 | 337 | Now, start the configuration editor: `urlwatch --edit-config` 338 | 339 | These are the keys you need to configure (see #158): 340 | 341 | - `report/email/enabled`: `true` 342 | - `report/email/from`: `your.username@gmail.com` (edit accordingly) 343 | - `report/email/method`: `smtp` 344 | - `report/email/smtp/host`: `smtp.gmail.com` 345 | - `report/email/smtp/auth`: `true` 346 | - `report/email/smtp/port`: `587` 347 | - `report/email/smtp/starttls`: `true` 348 | - `report/email/to`: The e-mail address you want to send reports to 349 | 350 | Now, for setting the password, it's not stored in the config file, but in your 351 | keychain. To store the password, run: `urlwatch --smtp-login` and enter your 352 | password. 
353 | 354 | 355 | E-MAIL VIA AMAZON SIMPLE EMAIL SERVICE (SES) 356 | -------------------------------------------- 357 | 358 | Start the configuration editor: `urlwatch --edit-config` 359 | 360 | These are the keys you need to configure: 361 | 362 | - `report/email/enabled`: `true` 363 | - `report/email/from`: `you@verified_domain.com` (edit accordingly) 364 | - `report/email/method`: `smtp` 365 | - `report/email/smtp/host`: `email-smtp.us-west-2.amazonaws.com` (edit accordingly) 366 | - `report/email/smtp/user`: `ABCDEFGHIJ1234567890` (edit accordingly) 367 | - `report/email/smtp/auth`: `true` 368 | - `report/email/smtp/port`: `587` (25 or 465 also work) 369 | - `report/email/smtp/starttls`: `true` 370 | - `report/email/to`: The e-mail address you want to send reports to 371 | 372 | The password is not stored in the config file, but in your keychain. To store 373 | the password, run: `urlwatch --smtp-login` and enter your password. 374 | 375 | 376 | SMTP LOGIN WITHOUT KEYRING 377 | -------------------------- 378 | 379 | If for whatever reason you cannot use a keyring to store your password 380 | (for example, when using it from a `cron` job) 381 | you can also set the `insecure_password` option in the SMTP config: 382 | 383 | - `report/email/smtp/auth`: `true` 384 | - `report/email/smtp/insecure_password`: `secret123` 385 | 386 | The `insecure_password` key will be preferred over the data stored in 387 | the keyring. Please note that as the name says, storing the password 388 | as plaintext in the configuration is insecure and bad practice, but 389 | for an e-mail account that's only dedicated for sending mails this 390 | might be a way. **Never ever use this with your your primary 391 | e-mail account!** Seriously! Create a throw-away GMail (or other) 392 | account just for sending out those e-mails or use local `sendmail` with 393 | a mail server configured instead of relying on SMTP and password auth. 
394 | 395 | Note that this makes it really easy for your password to be picked up 396 | by software running on your machine, by other users logged into the system 397 | and/or for the password to appear in log files accidentally. 398 | 399 | 400 | TESTING FILTERS 401 | --------------- 402 | 403 | While creating your filter pipeline, you might want to preview what the filtered 404 | output looks like. You can do so by first configuring your job and then running 405 | urlwatch with the `--test-filter` command, passing in the index (from `--list`) 406 | or the URL/location of the job to be tested: 407 | 408 | ``` 409 | urlwatch --test-filter 1 # Test the first job in the list 410 | urlwatch --test-filter https://example.net/ # Test the job with the given URL 411 | ``` 412 | 413 | The output of this command will be the filtered plaintext of the job, this is the 414 | output that will (in a real urlwatch run) be the input to the diff algorithm. 415 | 416 | 417 | SENDING COOKIES 418 | --------------- 419 | 420 | It is possible to add cookies to HTTP requests for pages that need it, the YAML 421 | syntax for this is: 422 | 423 | ```yaml 424 | url: http://example.com/ 425 | cookies: 426 | Key: ValueForKey 427 | OtherKey: OtherValue 428 | ``` 429 | 430 | 431 | WATCHING GITHUB RELEASES 432 | ------------------------ 433 | 434 | This is an example how to watch the GitHub "releases" page for a given 435 | project for the latest release version, to be notified of new releases: 436 | 437 | ```yaml 438 | url: "https://github.com/thp/urlwatch/releases/latest" 439 | filter: 440 | - xpath: '(//div[contains(@class,"release-timeline-tags")]//h4)[1]/a' 441 | - html2text: re 442 | ``` 443 | 444 | 445 | USING XPATH AND CSS FILTERS WITH XML AND EXCLUSIONS 446 | --------------------------------------------------- 447 | 448 | By default, XPath and CSS filters are set up for HTML documents. 
However, 449 | it is possible to use them for XML documents as well (these examples parse 450 | an RSS feed and filter only the titles and publication dates): 451 | 452 | ```yaml 453 | url: 'https://heronebag.com/blog/index.xml' 454 | filter: 455 | - xpath: 456 | path: '//item/title/text()|//item/pubDate/text()' 457 | method: xml 458 | ``` 459 | ```yaml 460 | url: 'https://heronebag.com/blog/index.xml' 461 | filter: 462 | - css: 463 | selector: 'item > title, item > pubDate' 464 | method: xml 465 | - html2text: re 466 | ``` 467 | 468 | To match an element in an [XML namespace](https://www.w3.org/TR/xml-names/), 469 | use a namespace prefix before the tag name. Use a `:` to seperate the namespace 470 | prefix and the tag name in an XPath expression, and use a `|` in a CSS selector. 471 | ```yaml 472 | url: 'https://www.wired.com/feed/rss' 473 | filter: 474 | - xpath: 475 | path: '//item/media:keywords' 476 | method: xml 477 | namespaces: 478 | media: http://search.yahoo.com/mrss/ 479 | ``` 480 | ```yaml 481 | url: 'https://www.wired.com/feed/rss' 482 | filter: 483 | - css: 484 | selector: 'item > media|keywords' 485 | method: xml 486 | namespaces: 487 | media: http://search.yahoo.com/mrss/ 488 | ``` 489 | Alternatively, use the XPath expression `//*[name()='']` to bypass 490 | the namespace entirely. 491 | 492 | Another useful option with XPath and CSS filters is `exclude`. Elements selected 493 | by this `exclude` expression are removed from the final result. For example, the 494 | following job will not have any `` tag in its results: 495 | 496 | ```yaml 497 | url: https://example.org/ 498 | filter: 499 | - css: 500 | selector: 'body' 501 | exclude: 'a' 502 | ``` 503 | 504 | 505 | COMPARE WITH SEVERAL LATEST SNAPSHOTS 506 | ------------------------------------- 507 | If a webpage frequently changes between several known stable states, it may be 508 | desirable to have changes reported only if the webpage changes into a new 509 | unknown state. 
You can use `compared_versions` to do this. 510 | 511 | ```yaml 512 | url: https://example.com/ 513 | compared_versions: 3 514 | ``` 515 | 516 | In this example, changes are only reported if the webpage becomes different from 517 | the latest three distinct states. The differences are shown relative to the 518 | closest match. 519 | 520 | 521 | REMOVE OR REPLACE TEXT USING REGULAR EXPRESSIONS 522 | ------------------------------------------------ 523 | 524 | Just like Python's `re.sub` function, there's the possibility to apply a regular 525 | expression and either remove of replace the matched text. The following example 526 | applies the filter 3 times: 527 | 528 | 1. Just specifying a string as the value will replace the matches with the empty string. 529 | 2. Simple patterns can be replaced with another string using "pattern" as the expression and "repl" as the replacement. 530 | 3. You can use groups (`()`) and back-reference them with `\1` (etc..) to put groups into the replacement string. 531 | 532 | All features are described in Python's [re.sub](https://docs.python.org/3/library/re.html#re.sub) 533 | documentation (the `pattern` and `repl` values are passed to this function as-is, with the value 534 | of `repl` defaulting to the empty string). 535 | 536 | 537 | ```yaml 538 | kind: url 539 | url: https://example.com/ 540 | filter: 541 | - re.sub: '\s*href="[^"]*"' 542 | - re.sub: 543 | pattern: '
<h1>
' 544 | repl: 'HEADING 1: ' 545 | - re.sub: 546 | pattern: ']*)>' 547 | repl: '' 548 | ``` 549 | 550 | 551 | MIGRATION FROM URLWATCH 1.x 552 | --------------------------- 553 | 554 | Migration from urlwatch 1.x should be automatic on first start. Here is a 555 | quick rundown of changes in 2.0: 556 | 557 | * URLs are stored in a YAML file now, with direct support for specifying 558 | names for jobs, different job kinds, directly applying filters, selecting 559 | the HTTP request method, specifying POST data as dictionary and much more 560 | * The cache directory has been replaced with a SQLite 3 database file 561 | "cache.db" in minidb format, storing all change history (use `--gc-cache` to 562 | remove old changes if you don't need them anymore) for further analysis 563 | * The hooks mechanism has been replaced with support for creating new job 564 | kinds by subclassing, new filters (also by subclassing) as well as new 565 | reporters (pieces of code that put the results somewhere, for example the 566 | default installation contains the "stdout" reporter that writes to the 567 | console and the "email" reporter that can send HTML and text e-mails) 568 | * A configuration file - urlwatch.yaml - has been added for specifying user 569 | preferences instead of having to supply everything via the command line 570 | 571 | 572 | CONTACT 573 | ------- 574 | 575 | Website: https://thp.io/2008/urlwatch/ 576 | 577 | E-Mail: m@thp.io 578 | -------------------------------------------------------------------------------- /lib/urlwatch/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. 
Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import re
import logging
import itertools
import os
import imp
import html.parser
import hashlib
import json

# for OCR
from PIL import Image
from io import BytesIO
import pytesseract

from enum import Enum
from lxml import etree
from lxml.cssselect import CSSSelector

from .util import TrackSubClasses

logger = logging.getLogger(__name__)


class FilterBase(object, metaclass=TrackSubClasses):
    """Base class for all filters.

    Subclasses with a __kind__ attribute are registered by the
    TrackSubClasses metaclass and looked up by name in process().
    """

    __subclasses__ = {}
    __anonymous_subclasses__ = []

    def __init__(self, job, state):
        self.job = job
        self.state = state

    def _no_subfilters(self, subfilter):
        # Helper for filters that do not accept a subfilter argument.
        if subfilter is not None:
            raise ValueError('No subfilters supported for {}'.format(self.__kind__))

    @classmethod
    def filter_documentation(cls):
        """Return a human-readable list of all registered filter kinds."""
        result = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            result.extend((
                ' * %s - %s' % (sc.__kind__, sc.__doc__),
            ))
        return '\n'.join(result)

    @classmethod
    def auto_process(cls, state, data):
        """Apply every filter whose match() returns True for this job."""
        filters = itertools.chain((filtercls for _, filtercls in
                                   sorted(cls.__subclasses__.items(), key=lambda k_v: k_v[0])),
                                  cls.__anonymous_subclasses__)

        for filtercls in filters:
            filter_instance = filtercls(state.job, state)
            if filter_instance.match():
                logger.info('Auto-applying filter %r to %s', filter_instance, state.job.get_location())
                data = filter_instance.filter(data)

        return data

    @classmethod
    def process(cls, filter_kind, subfilter, state, data):
        """Apply a single named filter kind (with optional subfilter) to data.

        Raises ValueError for unknown filter kinds.
        """
        logger.info('Applying filter %r, subfilter %r to %s', filter_kind, subfilter, state.job.get_location())
        filtercls = cls.__subclasses__.get(filter_kind, None)
        if filtercls is None:
            raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter))
        return filtercls(state.job, state).filter(data, subfilter)

    def match(self):
        # Subclasses override this to opt in to auto_process().
        return False

    def filter(self, data, subfilter=None):
        raise NotImplementedError()


class AutoMatchFilter(FilterBase):
    """Automatically matches subclass filters with a given location"""
    MATCH = None

    def match(self):
        if self.MATCH is None:
            return False

        d = self.job.to_dict()
        result = all(d.get(k, None) == v for k, v in self.MATCH.items())
        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result


class RegexMatchFilter(FilterBase):
    """Same as AutoMatchFilter but matching is done with regexes"""
    MATCH = None

    def match(self):
        if self.MATCH is None:
            return False

        d = self.job.to_dict()

        # It's a match if we have at least one key/value pair that matches,
        # and no key/value pairs that do not match
        matches = [v.match(d[k]) for k, v in self.MATCH.items() if k in d]
        result = len(matches) > 0 and all(matches)
        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result


class LegacyHooksPyFilter(FilterBase):
    """Runs the filter() function from a legacy (urlwatch 1.x) hooks.py file."""

    FILENAME = os.path.expanduser('~/.urlwatch/lib/hooks.py')

    def __init__(self, job, state):
        super().__init__(job, state)

        self.hooks = None
        if os.path.exists(self.FILENAME):
            try:
                # NOTE(review): the "imp" module is deprecated since Python
                # 3.4; consider migrating to importlib.util when the minimum
                # supported Python version allows.
                self.hooks = imp.load_source('legacy_hooks', self.FILENAME)
            except Exception as e:
                logger.error('Could not load legacy hooks file: %s', e)

    def match(self):
        return self.hooks is not None

    def filter(self, data, subfilter=None):
        try:
            result = self.hooks.filter(self.job.get_location(), data)
            if result is None:
                result = data
            return result
        except Exception as e:
            # logger.warn() is a deprecated alias of logger.warning().
            logger.warning('Could not apply legacy hooks filter: %s', e)
            return data
class Html2TextFilter(FilterBase):
    """Convert HTML to plaintext"""

    __kind__ = 'html2text'

    def filter(self, data, subfilter=None):
        """Convert data to plain text.

        The subfilter selects the conversion method ('re' by default): a
        string names the method directly, while a dict must contain a
        'method' key; its remaining items are passed on as options.
        """
        if subfilter is None:
            method = 're'
            options = {}
        elif isinstance(subfilter, dict):
            # Note: pop() mutates the caller-supplied dict.
            method = subfilter.pop('method')
            options = subfilter
        elif isinstance(subfilter, str):
            method = subfilter
            options = {}
        else:
            # Previously fell through with "method" unbound, producing a
            # confusing UnboundLocalError below.
            raise ValueError('Invalid subfilter for html2text: %r' % (subfilter,))
        from .html2txt import html2text
        return html2text(data, method=method, options=options)


class Ical2TextFilter(FilterBase):
    """Convert iCalendar to plaintext"""

    __kind__ = 'ical2text'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        from .ical2txt import ical2text
        return ical2text(data)


class JsonFormatFilter(FilterBase):
    """Convert to formatted json"""

    __kind__ = 'format-json'

    def filter(self, data, subfilter=None):
        # Optional subfilter: indentation width (default 4).
        indentation = 4
        if subfilter is not None:
            indentation = int(subfilter)
        parsed_json = json.loads(data)
        return json.dumps(parsed_json, ensure_ascii=False, sort_keys=True, indent=indentation, separators=(',', ': '))


class GrepFilter(FilterBase):
    """Filter only lines matching a regular expression"""

    __kind__ = 'grep'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('The grep filter needs a regular expression')

        return '\n'.join(line for line in data.splitlines()
                         if re.search(subfilter, line) is not None)


class InverseGrepFilter(FilterBase):
    """Filter which removes lines matching a regular expression"""

    __kind__ = 'grepi'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('The inverse grep filter needs a regular expression')

        return '\n'.join(line for line in data.splitlines()
                         if re.search(subfilter, line) is None)


class StripFilter(FilterBase):
    """Strip leading and trailing whitespace"""

    __kind__ = 'strip'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        return data.strip()


class CleanNewLinesFilter(FilterBase):
    """Clean up excessive newlines"""

    __kind__ = 'clean-new-lines'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        # One-pass join instead of the previous quadratic "+=" loop; the
        # final strip() matches the original's trailing-newline removal and
        # also trims whitespace at the very start/end of the result.
        return '\n'.join(line for line in data.split('\n') if line.strip() != '').strip()


class FilterBy(Enum):
    """Selector mode for ElementsBy: match on an attribute or on a tag name."""
    ATTRIBUTE = 1
    TAG = 2


class ElementsBy(html.parser.HTMLParser):
    """HTML parser that collects elements matched by attribute or tag name."""

    def __init__(self, filter_by, name, value=None):
        super().__init__()

        self._filter_by = filter_by
        if self._filter_by == FilterBy.ATTRIBUTE:
            self._attributes = {name: value}
        else:
            self._name = name

        self._result = []     # HTML fragments of matched elements
        self._inside = False  # True while inside a matched element
        self._elts = []       # stack of open tags within the matched element

    def get_html(self):
        """Return the concatenated HTML of all matched elements."""
        return ''.join(self._result)

    def handle_starttag(self, tag, attrs):
        ad = dict(attrs)

        if self._filter_by == FilterBy.ATTRIBUTE and all(ad.get(k, None) == v for k, v in self._attributes.items()):
            self._inside = True
        elif self._filter_by == FilterBy.TAG and tag == self._name:
            self._inside = True

        if self._inside:
            self._result.append('<%s%s%s>' % (tag, ' ' if attrs else '',
                                              ' '.join('%s="%s"' % (k, v) for k, v in attrs)))
            self._elts.append(tag)

    def handle_endtag(self, tag):
        if self._inside:
            self._result.append('</%s>' % (tag,))
            if tag in self._elts:
                # Pop until the matching open tag is found; tolerates
                # improperly nested HTML.
                t = self._elts.pop()
                while t != tag and self._elts:
                    t = self._elts.pop()
            if not self._elts:
                self._inside = False

    def handle_data(self, data):
        if self._inside:
            self._result.append(data)


class GetElementById(FilterBase):
    """Get an HTML element by its ID"""

    __kind__ = 'element-by-id'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element ID for filtering')

        element_by_id = ElementsBy(FilterBy.ATTRIBUTE, 'id', subfilter)
        element_by_id.feed(data)
        return element_by_id.get_html()


class GetElementByClass(FilterBase):
    """Get all HTML elements by class"""

    __kind__ = 'element-by-class'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element class for filtering')

        element_by_class = ElementsBy(FilterBy.ATTRIBUTE, 'class', subfilter)
        element_by_class.feed(data)
        return element_by_class.get_html()


class GetElementByStyle(FilterBase):
    """Get all HTML elements by style"""

    __kind__ = 'element-by-style'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element style for filtering')

        element_by_style = ElementsBy(FilterBy.ATTRIBUTE, 'style', subfilter)
        element_by_style.feed(data)
        return element_by_style.get_html()


class GetElementByTag(FilterBase):
    """Get an HTML element by its tag"""

    __kind__ = 'element-by-tag'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need a tag for filtering')

        element_by_tag = ElementsBy(FilterBy.TAG, subfilter)
        element_by_tag.feed(data)
        return element_by_tag.get_html()
= 'ocr' 368 | 369 | def filter(self, data, subfilter=None): 370 | image_data = Image.open(BytesIO(data)) 371 | return pytesseract.image_to_string(image_data) 372 | 373 | class Sha1Filter(FilterBase): 374 | """Calculate the SHA-1 checksum of the content""" 375 | 376 | __kind__ = 'sha1sum' 377 | 378 | def filter(self, data, subfilter=None): 379 | self._no_subfilters(subfilter) 380 | sha = hashlib.sha1() 381 | sha.update(data.encode('utf-8', 'ignore')) 382 | return sha.hexdigest() 383 | 384 | 385 | class HexdumpFilter(FilterBase): 386 | """Convert binary data to hex dump format""" 387 | 388 | __kind__ = 'hexdump' 389 | 390 | def filter(self, data, subfilter=None): 391 | self._no_subfilters(subfilter) 392 | data = bytearray(data.encode('utf-8', 'ignore')) 393 | blocks = [data[i * 16:(i + 1) * 16] for i in range(int((len(data) + (16 - 1)) / 16))] 394 | return '\n'.join('%s %s' % (' '.join('%02x' % c for c in block), 395 | ''.join((chr(c) if (c > 31 and c < 127) else '.') 396 | for c in block)) for block in blocks) 397 | 398 | 399 | class LxmlParser: 400 | EXPR_NAMES = {'css': 'a CSS selector', 401 | 'xpath': 'an XPath expression'} 402 | 403 | def __init__(self, filter_kind, subfilter, expr_key): 404 | self.filter_kind = filter_kind 405 | if subfilter is None: 406 | raise ValueError('Need %s for filtering' % (self.EXPR_NAMES[filter_kind],)) 407 | if isinstance(subfilter, str): 408 | self.expression = subfilter 409 | self.method = 'html' 410 | self.exclude = None 411 | self.namespaces = None 412 | elif isinstance(subfilter, dict): 413 | if expr_key not in subfilter: 414 | raise ValueError('Need %s for filtering' % (self.EXPR_NAMES[filter_kind],)) 415 | self.expression = subfilter[expr_key] 416 | self.method = subfilter.get('method', 'html') 417 | self.exclude = subfilter.get('exclude') 418 | self.namespaces = subfilter.get('namespaces') 419 | if self.method not in ('html', 'xml'): 420 | raise ValueError('%s method must be "html" or "xml", got %r' % (filter_kind, 
self.method)) 421 | if self.method == 'html' and self.namespaces is not None: 422 | raise ValueError('Namespace prefixes only supported with "xml" method.') 423 | else: 424 | raise ValueError('%s subfilter must be a string or dict' % (filter_kind,)) 425 | self.parser = (etree.HTMLParser if self.method == 'html' else etree.XMLParser)() 426 | self.data = '' 427 | 428 | def feed(self, data): 429 | self.data += data 430 | 431 | def _to_string(self, element): 432 | # Handle "/text()" selector, which returns lxml.etree._ElementUnicodeResult (Issue #282) 433 | if isinstance(element, str): 434 | return element 435 | 436 | return etree.tostring(element, pretty_print=True, method=self.method, encoding='unicode', with_tail=False) 437 | 438 | @staticmethod 439 | def _remove_element(element): 440 | parent = element.getparent() 441 | if parent is None: 442 | # Do not exclude root element 443 | return 444 | if isinstance(element, etree._ElementUnicodeResult): 445 | if element.is_tail: 446 | parent.tail = None 447 | elif element.is_text: 448 | parent.text = None 449 | elif element.is_attribute: 450 | del parent.attrib[element.attrname] 451 | else: 452 | previous = element.getprevious() 453 | if element.tail is not None: 454 | if previous is not None: 455 | previous.tail = previous.tail + element.tail if previous.tail else element.tail 456 | else: 457 | parent.text = parent.text + element.tail if parent.text else element.tail 458 | parent.remove(element) 459 | 460 | def _reevaluate(self, element): 461 | if self._orphaned(element): 462 | return None 463 | if isinstance(element, etree._ElementUnicodeResult): 464 | parent = element.getparent() 465 | if parent is None: 466 | return element 467 | if element.is_tail: 468 | return parent.tail 469 | elif element.is_text: 470 | return parent.text 471 | elif element.is_attribute: 472 | return parent.attrib.get(element.attrname) 473 | else: 474 | return element 475 | 476 | def _orphaned(self, element): 477 | if isinstance(element, 
etree._ElementUnicodeResult): 478 | parent = element.getparent() 479 | if ((element.is_tail and parent.tail is None) 480 | or (element.is_text and parent.text is None) 481 | or (element.is_attribute and parent.attrib.get(element.attrname) is None)): 482 | return True 483 | else: 484 | element = parent 485 | try: 486 | tree = element.getroottree() 487 | path = tree.getpath(element) 488 | return element is not tree.xpath(path, namespaces=self.namespaces)[0] 489 | except (ValueError, IndexError): 490 | return True 491 | 492 | def _get_filtered_elements(self): 493 | try: 494 | root = etree.fromstring(self.data, self.parser) 495 | except ValueError: 496 | # Strip XML declaration, for example: '' 497 | # for https://heronebag.com/blog/index.xml, an error happens, as we get a 498 | # a (Unicode) string, but the XML contains its own "encoding" declaration 499 | self.data = re.sub(r'^<[?]xml[^>]*[?]>', '', self.data) 500 | # Retry parsing with XML declaration removed (Fixes #281) 501 | root = etree.fromstring(self.data, self.parser) 502 | if root is None: 503 | return [] 504 | excluded_elems = None 505 | if self.filter_kind == 'css': 506 | selected_elems = CSSSelector(self.expression, 507 | namespaces=self.namespaces).evaluate(root) 508 | excluded_elems = CSSSelector(self.exclude, 509 | namespaces=self.namespaces).evaluate(root) if self.exclude else None 510 | elif self.filter_kind == 'xpath': 511 | selected_elems = root.xpath(self.expression, namespaces=self.namespaces) 512 | excluded_elems = root.xpath(self.exclude, namespaces=self.namespaces) if self.exclude else None 513 | if excluded_elems is not None: 514 | for el in excluded_elems: 515 | self._remove_element(el) 516 | return [el for el in map(self._reevaluate, selected_elems) if el is not None] 517 | 518 | def get_filtered_data(self): 519 | filtered_data = '\n'.join(self._to_string(element) for element in self._get_filtered_elements()) 520 | # TODO: get parent self.job object and pass here 521 | # if not 
filtered_data.strip(): 522 | # raise ValueError('%s did not match any markup for job: %r' % (self.filter_kind, self.job)) 523 | return filtered_data 524 | 525 | 526 | class CssFilter(FilterBase): 527 | """Filter XML/HTML using CSS selectors""" 528 | 529 | __kind__ = 'css' 530 | 531 | def filter(self, data, subfilter=None): 532 | lxml_parser = LxmlParser('css', subfilter, 'selector') 533 | lxml_parser.feed(data) 534 | filtered_data = lxml_parser.get_filtered_data() 535 | # TODO: move this raise higher in class 536 | if not filtered_data.strip(): 537 | raise ValueError('CssFilter did not match any markup for job: {}'.format(self.job)) 538 | return filtered_data 539 | 540 | 541 | class XPathFilter(FilterBase): 542 | """Filter XML/HTML using XPath expressions""" 543 | 544 | __kind__ = 'xpath' 545 | 546 | def filter(self, data, subfilter=None): 547 | lxml_parser = LxmlParser('xpath', subfilter, 'path') 548 | lxml_parser.feed(data) 549 | filtered_data = lxml_parser.get_filtered_data() 550 | # TODO: move this raise higher in class 551 | if not filtered_data.strip(): 552 | raise ValueError('XPathFilter did not match any markup for job: {}'.format(self.job)) 553 | return filtered_data 554 | 555 | 556 | class RegexSub(FilterBase): 557 | """Replace text with regular expressions using Python's re.sub""" 558 | 559 | __kind__ = 're.sub' 560 | 561 | def filter(self, data, subfilter=None): 562 | if subfilter is None: 563 | raise ValueError('{} needs a subfilter'.format(self.__kind__)) 564 | 565 | # Allow for just specifying a regular expression (that will be removed) 566 | if isinstance(subfilter, str): 567 | subfilter = {'pattern': subfilter} 568 | 569 | # Default: Replace with empty string if no "repl" value is set 570 | return re.sub(subfilter.get('pattern'), subfilter.get('repl', ''), data) 571 | -------------------------------------------------------------------------------- /lib/urlwatch/reporters.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 3 | # Copyright (c) 2008-2019 Thomas Perl 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions 8 | # are met: 9 | # 10 | # 1. Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 2. Redistributions in binary form must reproduce the above copyright 13 | # notice, this list of conditions and the following disclaimer in the 14 | # documentation and/or other materials provided with the distribution. 15 | # 3. The name of the author may not be used to endorse or promote products 16 | # derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | 30 | import difflib 31 | import tempfile 32 | import subprocess 33 | import re 34 | import shlex 35 | import email.utils 36 | import itertools 37 | import logging 38 | import os 39 | import sys 40 | import time 41 | import html 42 | import functools 43 | 44 | import requests 45 | 46 | import urlwatch 47 | from .mailer import SMTPMailer 48 | from .mailer import SendmailMailer 49 | from .util import TrackSubClasses 50 | 51 | try: 52 | import chump 53 | except ImportError: 54 | chump = None 55 | 56 | try: 57 | from pushbullet import Pushbullet 58 | except ImportError: 59 | Pushbullet = None 60 | 61 | logger = logging.getLogger(__name__) 62 | 63 | 64 | # Regular expressions that match the added/removed markers of GNU wdiff output 65 | WDIFF_ADDED_RE = r'[{][+].*?[+][}]' 66 | WDIFF_REMOVED_RE = r'[\[][-].*?[-][]]' 67 | 68 | 69 | class ReporterBase(object, metaclass=TrackSubClasses): 70 | __subclasses__ = {} 71 | 72 | def __init__(self, report, config, job_states, duration): 73 | self.report = report 74 | self.config = config 75 | self.job_states = job_states 76 | self.duration = duration 77 | 78 | def convert(self, othercls): 79 | if hasattr(othercls, '__kind__'): 80 | config = self.report.config['report'][othercls.__kind__] 81 | else: 82 | config = {} 83 | 84 | return othercls(self.report, config, self.job_states, self.duration) 85 | 86 | @classmethod 87 | def reporter_documentation(cls): 88 | result = [] 89 | for sc in TrackSubClasses.sorted_by_kind(cls): 90 | result.extend(( 91 | ' * %s - %s' % (sc.__kind__, sc.__doc__), 92 | )) 93 | return '\n'.join(result) 94 | 95 | @classmethod 96 | def submit_all(cls, report, job_states, duration): 97 | any_enabled = False 98 | for name, subclass in cls.__subclasses__.items(): 99 | cfg = report.config['report'].get(name, {'enabled': False}) 100 | if cfg['enabled']: 101 | any_enabled = True 102 | logger.info('Submitting with %s (%r)', name, subclass) 103 | subclass(report, cfg, job_states, duration).submit() 104 | 105 | 
if not any_enabled: 106 | logger.warn('No reporters enabled.') 107 | 108 | def submit(self): 109 | raise NotImplementedError() 110 | 111 | def unified_diff(self, job_state): 112 | if job_state.job.diff_tool is not None: 113 | with tempfile.TemporaryDirectory() as tmpdir: 114 | old_file_path = os.path.join(tmpdir, 'old_file') 115 | new_file_path = os.path.join(tmpdir, 'new_file') 116 | with open(old_file_path, 'w+b') as old_file, open(new_file_path, 'w+b') as new_file: 117 | old_file.write(job_state.old_data.encode('utf-8')) 118 | new_file.write(job_state.new_data.encode('utf-8')) 119 | cmdline = shlex.split(job_state.job.diff_tool) + [old_file_path, new_file_path] 120 | proc = subprocess.Popen(cmdline, stdout=subprocess.PIPE) 121 | stdout, _ = proc.communicate() 122 | # Diff tools return 0 for "nothing changed" or 1 for "files differ", anything else is an error 123 | if proc.returncode in (0, 1): 124 | return stdout.decode('utf-8') 125 | else: 126 | raise subprocess.CalledProcessError(proc.returncode, cmdline) 127 | 128 | timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) 129 | timestamp_new = email.utils.formatdate(time.time(), localtime=1) 130 | return ''.join(difflib.unified_diff([l + '\n' for l in job_state.old_data.splitlines()], 131 | [l + '\n' for l in job_state.new_data.splitlines()], 132 | '@', '@', timestamp_old, timestamp_new)) 133 | 134 | 135 | class SafeHtml(object): 136 | def __init__(self, s): 137 | self.s = s 138 | 139 | def __str__(self): 140 | return self.s 141 | 142 | def format(self, *args, **kwargs): 143 | return str(self).format(*(html.escape(str(arg)) for arg in args), 144 | **{k: html.escape(str(v)) for k, v in kwargs.items()}) 145 | 146 | 147 | class HtmlReporter(ReporterBase): 148 | def submit(self): 149 | yield from (str(part) for part in self._parts()) 150 | 151 | def _parts(self): 152 | cfg = self.report.config['report']['html'] 153 | 154 | yield SafeHtml(""" 155 | 156 | urlwatch 157 | 158 | 159 | 170 | 171 | """) 
172 | 173 | for job_state in self.report.get_filtered_job_states(self.job_states): 174 | job = job_state.job 175 | 176 | if job.LOCATION_IS_URL: 177 | title = '{pretty_name}' 178 | elif job.pretty_name() != job.get_location(): 179 | title = '{pretty_name}' 180 | else: 181 | title = '{location}' 182 | title = '

{verb}: ' + title + '

' 183 | 184 | yield SafeHtml(title).format(verb=job_state.verb, 185 | location=job.get_location(), 186 | pretty_name=job.pretty_name()) 187 | 188 | content = self._format_content(job_state, cfg['diff']) 189 | if content is not None: 190 | yield content 191 | 192 | yield SafeHtml('
') 193 | 194 | yield SafeHtml(""" 195 |
196 | {pkgname} {version}, {copyright}
197 | Website: {url}
198 | watched {count} URLs in {duration} seconds 199 |
200 | 201 | 202 | """).format(pkgname=urlwatch.pkgname, version=urlwatch.__version__, copyright=urlwatch.__copyright__, 203 | url=urlwatch.__url__, count=len(self.job_states), duration=self.duration.seconds) 204 | 205 | def _diff_to_html(self, unified_diff): 206 | for line in unified_diff.splitlines(): 207 | if line.startswith('+'): 208 | yield SafeHtml('{line}').format(line=line) 209 | elif line.startswith('-'): 210 | yield SafeHtml('{line}').format(line=line) 211 | else: 212 | yield SafeHtml('{line}').format(line=line) 213 | 214 | def _format_content(self, job_state, difftype): 215 | if job_state.verb == 'error': 216 | return SafeHtml('
{error}
').format(error=job_state.traceback.strip()) 217 | 218 | if job_state.verb == 'unchanged': 219 | return SafeHtml('
{old_data}
').format(old_data=job_state.old_data) 220 | 221 | if job_state.old_data in (None, job_state.new_data): 222 | return SafeHtml('...') 223 | 224 | if difftype == 'table': 225 | timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) 226 | timestamp_new = email.utils.formatdate(time.time(), localtime=1) 227 | html_diff = difflib.HtmlDiff() 228 | return SafeHtml(html_diff.make_table(job_state.old_data.splitlines(1), job_state.new_data.splitlines(1), 229 | timestamp_old, timestamp_new, True, 3)) 230 | elif difftype == 'unified': 231 | return ''.join(( 232 | '
',
233 |                 '\n'.join(self._diff_to_html(self.unified_diff(job_state))),
234 |                 '
class TextReporter(ReporterBase):
    """Base class for reporters that render the report as plain text."""

    def submit(self):
        cfg = self.report.config['report']['text']
        line_length = cfg['line_length']
        show_details = cfg['details']
        show_footer = cfg['footer']

        if cfg['minimal']:
            # Minimal mode: one "VERB: location" line per job, no details
            for job_state in self.report.get_filtered_job_states(self.job_states):
                pretty_name = job_state.job.pretty_name()
                location = job_state.job.get_location()
                if pretty_name != location:
                    location = '%s ( %s )' % (pretty_name, location)
                yield ': '.join((job_state.verb.upper(), location))
            return

        summary = []
        details = []
        for job_state in self.report.get_filtered_job_states(self.job_states):
            summary_part, details_part = self._format_output(job_state, line_length)
            summary.extend(summary_part)
            details.extend(details_part)

        if summary:
            # line_length == 0 disables separator lines (sep becomes None
            # and is filtered out below)
            sep = (line_length * '=') or None
            yield from (part for part in itertools.chain(
                (sep,),
                ('%02d. %s' % (idx + 1, line) for idx, line in enumerate(summary)),
                (sep, ''),
            ) if part is not None)

        if show_details:
            yield from details

        if summary and show_footer:
            yield from ('-- ',
                        '%s %s, %s' % (urlwatch.pkgname, urlwatch.__version__, urlwatch.__copyright__),
                        'Website: %s' % (urlwatch.__url__,),
                        'watched %d URLs in %d seconds' % (len(self.job_states), self.duration.seconds))

    def _format_content(self, job_state):
        # Returns the text body for one job state, or None when there is
        # nothing to show (new job or unchanged content)
        if job_state.verb == 'error':
            return job_state.traceback.strip()

        if job_state.verb == 'unchanged':
            return job_state.old_data

        if job_state.old_data in (None, job_state.new_data):
            return None

        return self.unified_diff(job_state)

    def _format_output(self, job_state, line_length):
        summary_part = []
        details_part = []

        pretty_name = job_state.job.pretty_name()
        location = job_state.job.get_location()
        if pretty_name != location:
            location = '%s ( %s )' % (pretty_name, location)

        pretty_summary = ': '.join((job_state.verb.upper(), pretty_name))
        summary = ': '.join((job_state.verb.upper(), location))
        content = self._format_content(job_state)

        summary_part.append(pretty_summary)

        sep = (line_length * '-') or None
        details_part.extend((sep, summary, sep))
        if content is not None:
            details_part.extend((content, sep))
        details_part.extend(('', '') if sep else ('',))
        details_part = [part for part in details_part if part is not None]

        return summary_part, details_part


class StdoutReporter(TextReporter):
    """Print summary on stdout (the console)"""

    __kind__ = 'stdout'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Only colorize when writing to a real terminal and color is enabled
        self._has_color = sys.stdout.isatty() and self.config.get('color', False)

    def _incolor(self, color_id, s):
        # Wrap s in an ANSI bright-color escape sequence (90 + color_id)
        if self._has_color:
            return '\033[9%dm%s\033[0m' % (color_id, s)
        return s

    def _red(self, s):
        return self._incolor(1, s)

    def _green(self, s):
        return self._incolor(2, s)

    def _yellow(self, s):
        return self._incolor(3, s)

    def _blue(self, s):
        return self._incolor(4, s)

    def _get_print(self):
        # On Windows, route output through colorama so ANSI escapes work
        if sys.platform == 'win32' and self._has_color:
            from colorama import AnsiToWin32
            return functools.partial(print, file=AnsiToWin32(sys.stdout).stream)
        return print

    def submit(self):
        print = self._get_print()

        cfg = self.report.config['report']['text']
        line_length = cfg['line_length']

        separators = (line_length * '=', line_length * '-', '-- ') if line_length else ()
        body = '\n'.join(super().submit())

        for line in body.splitlines():
            # Basic colorization for wdiff-style differences
            line = re.sub(WDIFF_ADDED_RE, lambda x: self._green(x.group(0)), line)
            line = re.sub(WDIFF_REMOVED_RE, lambda x: self._red(x.group(0)), line)

            # FIXME: This isn't ideal, but works for now...
            if line in separators:
                print(line)
            elif line.startswith('+'):
                print(self._green(line))
            elif line.startswith('-'):
                print(self._red(line))
            elif any(line.startswith(prefix) for prefix in ('NEW:', 'CHANGED:', 'UNCHANGED:', 'ERROR:')):
                first, second = line.split(' ', 1)
                if line.startswith('ERROR:'):
                    print(first, self._red(second))
                else:
                    print(first, self._blue(second))
            else:
                print(line)


class EMailReporter(TextReporter):
    """Send summary via e-mail / SMTP"""

    __kind__ = 'email'

    def submit(self):
        filtered_job_states = list(self.report.get_filtered_job_states(self.job_states))

        subject_args = {
            'count': len(filtered_job_states),
            'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states),
        }
        subject = self.config['subject'].format(**subject_args)

        body_text = '\n'.join(super().submit())

        if not body_text:
            logger.debug('Not sending e-mail (no changes)')
            return
        if self.config['method'] == "smtp":
            smtp_user = self.config['smtp'].get('user', None) or self.config['from']
            # Legacy support: The current smtp "auth" setting was previously called "keyring"
            if 'keyring' in self.config['smtp']:
                logger.info('The SMTP config key "keyring" is now called "auth".')
            use_auth = self.config['smtp'].get('auth', self.config['smtp'].get('keyring', False))
            mailer = SMTPMailer(smtp_user, self.config['smtp']['host'], self.config['smtp']['port'],
                                self.config['smtp']['starttls'], use_auth,
                                self.config['smtp'].get('insecure_password'))
        elif self.config['method'] == "sendmail":
            mailer = SendmailMailer(self.config['sendmail']['path'])
        else:
            logger.error('Invalid entry for method {method}'.format(method=self.config['method']))
            # Bail out: without a valid method there is no mailer
            # (previously fell through and raised UnboundLocalError)
            return

        if self.config['html']:
            body_html = '\n'.join(self.convert(HtmlReporter).submit())

            msg = mailer.msg_html(self.config['from'], self.config['to'], subject, body_text, body_html)
        else:
            msg = mailer.msg_plain(self.config['from'], self.config['to'], subject, body_text)

        mailer.send(msg)


class WebServiceReporter(TextReporter):
    """Base class for push-style web services with a small message limit."""

    MAX_LENGTH = 1024

    def web_service_get(self):
        """Return a service handle; implemented by subclasses."""
        raise NotImplementedError

    def web_service_submit(self, service, title, body):
        """Deliver body to the service; implemented by subclasses."""
        raise NotImplementedError

    def submit(self):
        body_text = '\n'.join(super().submit())

        if not body_text:
            logger.debug('Not sending %s (no changes)', self.__kind__)
            return

        # Truncate to the service's maximum message length
        if len(body_text) > self.MAX_LENGTH:
            body_text = body_text[:self.MAX_LENGTH]

        try:
            service = self.web_service_get()
        except Exception:
            logger.error('Failed to load or connect to %s - are the dependencies installed and configured?',
                         self.__kind__, exc_info=True)
            return

        self.web_service_submit(service, 'Website Change Detected', body_text)


class PushoverReport(WebServiceReporter):
    """Send summary via pushover.net"""

    __kind__ = 'pushover'

    def web_service_get(self):
        app = chump.Application(self.config['app'])
        return app.get_user(self.config['user'])

    def web_service_submit(self, service, title, body):
        sound = self.config['sound']
        # If device is the empty string or not specified at all, use None to send to all devices
        # (see https://github.com/thp/urlwatch/issues/372)
        device = self.config.get('device', None) or None
        msg = service.create_message(title=title, message=body, html=True, sound=sound, device=device)
        msg.send()


class PushbulletReport(WebServiceReporter):
    """Send summary via pushbullet.com"""

    __kind__ = 'pushbullet'

    def web_service_get(self):
        return Pushbullet(self.config['api_key'])

    def web_service_submit(self, service, title, body):
        service.push_note(title, body)


class MailGunReporter(TextReporter):
    """Custom email reporter that uses Mailgun"""

    __kind__ = 'mailgun'

    def submit(self):
        region = self.config.get('region', '')
        domain = self.config['domain']
        api_key = self.config['api_key']
        from_name = self.config['from_name']
        from_mail = self.config['from_mail']
        to = self.config['to']

        # 'us' is Mailgun's default region and has no API-host prefix
        if region == 'us':
            region = ''

        if region != '':
            region = ".{0}".format(region)

        filtered_job_states = list(self.report.get_filtered_job_states(self.job_states))
        subject_args = {
            'count': len(filtered_job_states),
            'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states),
        }
        subject = self.config['subject'].format(**subject_args)

        body_text = '\n'.join(super().submit())
        body_html = '\n'.join(self.convert(HtmlReporter).submit())

        if not body_text:
            logger.debug('Not calling Mailgun API (no changes)')
            return

        logger.debug("Sending Mailgun request for domain:'{0}'".format(domain))
        result = requests.post(
            "https://api{0}.mailgun.net/v3/{1}/messages".format(region, domain),
            auth=("api", api_key),
            data={"from": "{0} <{1}>".format(from_name, from_mail),
                  "to": to,
                  "subject": subject,
                  "text": body_text,
                  "html": body_html})

        try:
            json_res = result.json()

            if result.status_code == requests.codes.ok:
                logger.info("Mailgun response: id '{0}'. {1}".format(json_res['id'], json_res['message']))
            else:
                logger.error("Mailgun error: {0}".format(json_res['message']))
        except ValueError:
            logger.error(
                "Failed to parse Mailgun response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                               result.content))

        return result


class TelegramReporter(TextReporter):
    """Custom Telegram reporter"""

    # Telegram's sendMessage API caps messages at 4096 characters
    MAX_LENGTH = 4096

    __kind__ = 'telegram'

    def submit(self):
        bot_token = self.config['bot_token']
        chat_ids = self.config['chat_id']
        # Accept a single chat id or a list of them
        chat_ids = [chat_ids] if isinstance(chat_ids, str) else chat_ids

        text = '\n'.join(super().submit())

        if not text:
            logger.debug('Not calling telegram API (no changes)')
            return

        result = None
        for chunk in self.chunkstring(text, self.MAX_LENGTH):
            for chat_id in chat_ids:
                res = self.submitToTelegram(bot_token, chat_id, chunk)
                # Check for None before touching status_code (the previous
                # operand order would raise AttributeError on a None result)
                if res is None or res.status_code != requests.codes.ok:
                    result = res

        return result

    def submitToTelegram(self, bot_token, chat_id, text):
        logger.debug("Sending telegram request to chat id:'{0}'".format(chat_id))
        result = requests.post(
            "https://api.telegram.org/bot{0}/sendMessage".format(bot_token),
            data={"chat_id": chat_id, "text": text, "disable_web_page_preview": "true"})
        try:
            json_res = result.json()

            if result.status_code == requests.codes.ok:
                logger.info("Telegram response: ok '{0}'. {1}".format(json_res['ok'], json_res['result']))
            else:
                logger.error("Telegram error: {0}".format(json_res['description']))
        except ValueError:
            logger.error(
                "Failed to parse telegram response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                                result.content))
        return result

    def chunkstring(self, string, length):
        # Yield successive slices of at most `length` characters
        return (string[0 + i:length + i] for i in range(0, len(string), length))


class SlackReporter(TextReporter):
    """Custom Slack reporter"""

    # Slack rejects payloads with text longer than 40000 characters
    MAX_LENGTH = 40000

    __kind__ = 'slack'

    def submit(self):
        webhook_url = self.config['webhook_url']
        text = '\n'.join(super().submit())

        if not text:
            logger.debug('Not calling slack API (no changes)')
            return

        result = None
        for chunk in self.chunkstring(text, self.MAX_LENGTH):
            res = self.submit_to_slack(webhook_url, chunk)
            # Check for None before touching status_code (the previous
            # operand order would raise AttributeError on a None result)
            if res is None or res.status_code != requests.codes.ok:
                result = res

        return result

    def submit_to_slack(self, webhook_url, text):
        logger.debug("Sending slack request with text:{0}".format(text))
        post_data = {"text": text}
        result = requests.post(webhook_url, json=post_data)
        try:
            if result.status_code == requests.codes.ok:
                logger.info("Slack response: ok")
            else:
                logger.error("Slack error: {0}".format(result.text))
        except ValueError:
            logger.error(
                "Failed to parse slack response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                             result.content))
        return result

    def chunkstring(self, string, length):
        # Yield successive slices of at most `length` characters
        return (string[0 + i:length + i] for i in range(0, len(string), length))


class WebhookReporter(TextReporter):
    """Custom IFTTT webhook reporter for Google Sheets"""

    __kind__ = 'webhook'

    def submit(self):
        result = None
        for job_state in self.report.get_filtered_job_states(self.job_states):
            pretty_name = job_state.job.pretty_name()
            location = job_state.job.get_location()
            content = self._format_content(job_state)

            res = self.post_report(pretty_name, location, content)
            # Check for None before touching status_code (the previous
            # operand order would raise AttributeError on a None result)
            if res is None or res.status_code != requests.codes.ok:
                result = res
        return result

    def post_report(self, pretty_name, location, content):
        # IFTTT webhooks accept up to three generic values per event
        webhook_url = self.config['webhook_url']
        post_data = {'value1': pretty_name, 'value2': location, 'value3': content}
        result = requests.post(webhook_url, json=post_data)

        try:
            if result.status_code == requests.codes.ok:
                logger.info("Webhook response: ok")
            else:
                logger.error("Webhook error: {0}".format(result.text))
        except ValueError:
            # Fixed typo in log message: "webook" -> "webhook"
            logger.error(
                "Failed to parse webhook response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                               result.content))
        return result