├── .gitignore ├── setup.cfg ├── test ├── data │ ├── invalid-url.yaml │ ├── urlwatch.yaml │ ├── urls.txt │ └── filter_tests.yaml ├── test_filters.py └── test_handler.py ├── MANIFEST.in ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── requirements-dev.txt ├── lib └── urlwatch │ ├── __init__.py │ ├── ical2txt.py │ ├── migration.py │ ├── main.py │ ├── util.py │ ├── cli.py │ ├── mailer.py │ ├── worker.py │ ├── html2txt.py │ ├── config.py │ ├── handler.py │ ├── jobs.py │ ├── command.py │ ├── storage.py │ ├── filters.py │ └── reporters.py ├── COPYING ├── share ├── urlwatch │ └── examples │ │ ├── urls.yaml.example │ │ └── hooks.py.example └── man │ └── man1 │ └── urlwatch.1 ├── setup.py ├── CHANGELOG.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | build 4 | /venv -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pycodestyle] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /test/data/invalid-url.yaml: -------------------------------------------------------------------------------- 1 | name: "invalid url" 2 | url: "https://invalid" 3 | max_tries: 2 4 | --- 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md COPYING README.md 2 | recursive-include share * 3 | recursive-include test/data * 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | install: 8 | - python setup.py install_dependencies 9 | script: nosetests -v 10 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # The COVID Tracking Project Code of Conduct 2 | 3 | The COVID Tracking Project follows the [code of conduct defined in this document](https://github.com/COVID19Tracking/code-of-conduct/blob/master/CODE_OF_CONDUCT.md). -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | certifi==2019.11.28 3 | chardet==3.0.4 4 | cssselect==1.1.0 5 | idna==2.9 6 | keyring==21.2.0 7 | lxml==4.5.0 8 | minidb==2.0.2 9 | PyYAML==5.3.1 10 | requests==2.23.0 11 | urllib3==1.25.8 12 | pytesseract==0.3.3 13 | Pillow==7.0.0 14 | -------------------------------------------------------------------------------- /lib/urlwatch/__init__.py: -------------------------------------------------------------------------------- 1 | """urlwatch monitors webpages for you 2 | 3 | urlwatch is intended to help you watch changes in webpages and get notified 4 | (via e-mail, in your terminal or through various third party services) of any 5 | changes. The change notification will include the URL that has changed and 6 | a unified diff of what has changed. 
7 | """ 8 | 9 | pkgname = 'urlwatch' 10 | 11 | __copyright__ = 'Copyright 2008-2019 Thomas Perl' 12 | __author__ = 'Thomas Perl ' 13 | __license__ = 'BSD' 14 | __url__ = 'https://thp.io/2008/urlwatch/' 15 | __version__ = '2.17' 16 | __user_agent__ = '%s/%s (+https://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__) 17 | -------------------------------------------------------------------------------- /test/data/urlwatch.yaml: -------------------------------------------------------------------------------- 1 | display: 2 | error: true 3 | new: true 4 | unchanged: false 5 | report: 6 | email: 7 | enabled: false 8 | from: '' 9 | html: false 10 | method: smtp 11 | sendmail: 12 | path: sendmail 13 | smtp: 14 | host: localhost 15 | auth: true 16 | port: 25 17 | starttls: true 18 | subject: '{count} changes: {jobs}' 19 | to: '' 20 | html: 21 | diff: unified 22 | pushover: 23 | app: '' 24 | device: null 25 | enabled: false 26 | sound: 'spacealarm' 27 | user: '' 28 | stdout: 29 | color: true 30 | enabled: true 31 | text: 32 | details: true 33 | footer: true 34 | line_length: 75 35 | job_defaults: 36 | all: {} 37 | shell: {} 38 | url: {} 39 | browser: {} 40 | -------------------------------------------------------------------------------- /test/data/urls.txt: -------------------------------------------------------------------------------- 1 | 2 | # This is an example urls.txt file for urlwatch 3 | # Empty lines and lines starting with "#" are ignored 4 | 5 | http://www.dubclub-vienna.com/ 6 | http://www.openpandora.org/developers.php 7 | #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/info.html 8 | #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/blatter.html 9 | #http://www.dbai.tuwien.ac.at/education/dbs/current/index.html 10 | #http://www.dbai.tuwien.ac.at/education/dbs/current/uebung.html 11 | http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming 12 | http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming/labor 13 | 
http://ti.tuwien.ac.at/rts/teaching/courses/betriebssysteme 14 | #http://www.complang.tuwien.ac.at/anton/lvas/effiziente-programme.html 15 | #http://www.complang.tuwien.ac.at/anton/lvas/effizienz-aufgabe08/ 16 | http://www.kukuk.at/ical/events 17 | http://guckes.net/cal/ 18 | 19 | # You can use the pipe character to "watch" the output of shell commands 20 | |ls -al ~ 21 | 22 | # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) 23 | http://example.org/With%20Spaces/ 24 | 25 | # You can do POST requests by writing the POST data behind the URL, 26 | # separated by a single space character. POST data is URL-encoded. 27 | http://example.com/search.cgi button=Search&q=something&category=4 28 | 29 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2019 Thomas Perl 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 3. The name of the author may not be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
19 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /test/test_filters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import yaml 4 | from urlwatch.filters import FilterBase 5 | from nose.tools import eq_ 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def test_filters(): 11 | def check_filter(test_name): 12 | filter = filter_tests[test_name]['filter'] 13 | data = filter_tests[test_name]['data'] 14 | expected_result = filter_tests[test_name]['expected_result'] 15 | if isinstance(filter, dict): 16 | key = next(iter(filter)) 17 | kind, subfilter = key, filter[key] 18 | elif isinstance(filter, str): 19 | if ',' in filter: 20 | raise ValueError('Only single filter allowed in this test') 21 | elif ':' in filter: 22 | kind, subfilter = filter.split(':', 1) 23 | else: 24 | kind = filter 25 | subfilter = None 26 | logger.info('filter kind: %s, subfilter: %s', kind, subfilter) 27 | filtercls = FilterBase.__subclasses__.get(kind) 28 | if filtercls is None: 29 | raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter)) 30 | result = filtercls(None, None).filter(data, subfilter) 31 | logger.debug('Expected result:\n%s', expected_result) 32 | logger.debug('Actual result:\n%s', result) 33 | eq_(result, expected_result) 34 | 35 | with open(os.path.join(os.path.dirname(__file__), 'data/filter_tests.yaml'), 'r', encoding='utf8') 
as fp: 36 | filter_tests = yaml.load(fp, Loader=yaml.SafeLoader) 37 | for test_name in filter_tests: 38 | yield check_filter, test_name 39 | -------------------------------------------------------------------------------- /share/urlwatch/examples/urls.yaml.example: -------------------------------------------------------------------------------- 1 | # This is an example urls.yaml file for urlwatch 2 | 3 | # A basic URL job just needs a URL 4 | name: "urlwatch webpage" 5 | url: "https://thp.io/2008/urlwatch/" 6 | # You can use a pre-supplied filter for this, here we apply two: 7 | # the html2text filter that converts the HTML to plaintext and 8 | # the grep filter that filters lines based on a regular expression 9 | filter: html2text,grep:Current.*version,strip 10 | --- 11 | # Built-in job kind "shell" needs a command specified 12 | name: "Home Listing" 13 | command: "ls -al ~" 14 | #--- 15 | #name: "Login to some webpage (custom job)" 16 | #url: "http://example.org/" 17 | # This job kind is defined in hooks.py, so you need to enable it 18 | #kind: custom-login 19 | # Additional parameters for the custom-login job kind can be specified here 20 | #username: "myuser" 21 | #password: "secret" 22 | # Filters can be specified here, separated by comma (these are also from hooks.py) 23 | #filter: case:upper,indent:5 24 | --- 25 | # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) 26 | url: "http://example.org/With%20Spaces/" 27 | --- 28 | # POST requests are done by providing a post parameter 29 | url: "http://example.com/search.cgi" 30 | data: "button=Search&q=something&category=4" 31 | --- 32 | # You can use a custom HTTP method, this might be useful for cache invalidation 33 | url: "http://example.com/foo" 34 | method: "PURGE" 35 | --- 36 | # You can do POST requests by providing data parameter. 37 | # POST data can be a URL-encoded string (see last example) or a dict. 
38 | url: "http://example.com/search.cgi" 39 | data: 40 | button: Search 41 | q: something 42 | category: 4 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup 4 | from distutils import cmd 5 | 6 | import os 7 | import re 8 | import sys 9 | 10 | main_py = open(os.path.join('lib', 'urlwatch', '__init__.py')).read() 11 | m = dict(re.findall("\n__([a-z]+)__ = '([^']+)'", main_py)) 12 | docs = re.findall('"""(.*?)"""', main_py, re.DOTALL) 13 | 14 | if sys.version_info < (3, 3): 15 | sys.exit('urlwatch requires Python 3.3 or newer') 16 | 17 | m['name'] = 'urlwatch' 18 | m['author'], m['author_email'] = re.match(r'(.*) <(.*)>', m['author']).groups() 19 | m['description'], m['long_description'] = docs[0].strip().split('\n\n', 1) 20 | m['install_requires'] = ['minidb', 'PyYAML', 'requests', 'keyring', 'pycodestyle', 'appdirs', 'lxml', 'cssselect', 'pytesseract', 'Pillow'] 21 | if sys.version_info < (3, 4): 22 | m['install_requires'].extend(['enum34']) 23 | if sys.platform == 'win32': 24 | m['install_requires'].extend(['colorama']) 25 | m['entry_points'] = {"console_scripts": ["urlwatch=urlwatch.cli:main"]} 26 | m['package_dir'] = {'': 'lib'} 27 | m['packages'] = ['urlwatch'] 28 | m['python_requires'] = '>3.3.0' 29 | m['data_files'] = [ 30 | ('share/man/man1', ['share/man/man1/urlwatch.1']), 31 | ('share/urlwatch/examples', [ 32 | 'share/urlwatch/examples/hooks.py.example', 33 | 'share/urlwatch/examples/urls.yaml.example', 34 | ]), 35 | ] 36 | 37 | 38 | class InstallDependencies(cmd.Command): 39 | """Install dependencies only""" 40 | 41 | description = 'Only install required packages using pip' 42 | user_options = [] 43 | 44 | def initialize_options(self): 45 | ... 46 | 47 | def finalize_options(self): 48 | ... 
49 | 50 | def run(self): 51 | global m 52 | try: 53 | from pip._internal import main 54 | except ImportError: 55 | from pip import main 56 | try: 57 | main(['install', '--upgrade'] + m['install_requires']) 58 | except TypeError: # recent pip 59 | main.main(['install', '--upgrade'] + m['install_requires']) 60 | 61 | 62 | m['cmdclass'] = {'install_dependencies': InstallDependencies} 63 | 64 | del m['copyright'] 65 | setup(**m) 66 | -------------------------------------------------------------------------------- /share/man/man1/urlwatch.1: -------------------------------------------------------------------------------- 1 | .TH URLWATCH "1" "January 2019" "urlwatch 2.16" "User Commands" 2 | .SH NAME 3 | urlwatch \- monitors webpages for you 4 | .SH SYNOPSIS 5 | .B urlwatch [options] 6 | .SH DESCRIPTION 7 | urlwatch is intended to help you watch changes in webpages and get notified 8 | (via e\-mail, in your terminal or through various third party services) of any 9 | changes. The change notification will include the URL that has changed and 10 | a unified diff of what has changed. 
11 | .SS "optional arguments:" 12 | .TP 13 | \fB\-h\fR, \fB\-\-help\fR 14 | show this help message and exit 15 | .TP 16 | \fB\-\-version\fR 17 | show program's version number and exit 18 | .TP 19 | \fB\-v\fR, \fB\-\-verbose\fR 20 | show debug output 21 | .SS "files and directories:" 22 | .TP 23 | \fB\-\-urls\fR FILE 24 | read job list (URLs) from FILE 25 | .TP 26 | \fB\-\-config\fR FILE 27 | read configuration from FILE 28 | .TP 29 | \fB\-\-hooks\fR FILE 30 | use FILE as hooks.py module 31 | .TP 32 | \fB\-\-cache\fR FILE 33 | use FILE as cache database 34 | .SS "Authentication:" 35 | .TP 36 | \fB\-\-smtp\-login\fR 37 | Enter password for SMTP (store in keyring) 38 | .TP 39 | \fB\-\-telegram\-chats\fR 40 | List telegram chats the bot is joined to 41 | .TP 42 | \fB\-\-test\-slack\fR 43 | Send a test notification to Slack 44 | .SS "job list management:" 45 | .TP 46 | \fB\-\-list\fR 47 | list jobs 48 | .TP 49 | \fB\-\-add\fR JOB 50 | add job (key1=value1,key2=value2,...) 51 | .TP 52 | \fB\-\-delete\fR JOB 53 | delete job by location or index 54 | .TP 55 | \fB\-\-test\-filter\fR JOB 56 | test filter output of job by location or index 57 | .SS "interactive commands ($EDITOR/$VISUAL):" 58 | .TP 59 | \fB\-\-edit\fR 60 | edit URL/job list 61 | .TP 62 | \fB\-\-edit\-config\fR 63 | edit configuration file 64 | .TP 65 | \fB\-\-edit\-hooks\fR 66 | edit hooks script 67 | .SS "miscellaneous:" 68 | .TP 69 | \fB\-\-features\fR 70 | list supported jobs/filters/reporters 71 | .TP 72 | \fB\-\-gc\-cache\fR 73 | remove old cache entries 74 | .SH "FILES" 75 | .TP 76 | .B $XDG_CONFIG_HOME/urlwatch/urls.yaml 77 | A list of URLs, commands and other jobs to watch 78 | .TP 79 | .B $XDG_CONFIG_HOME/urlwatch/hooks.py 80 | A Python module that can implement new job types, filters and reporters 81 | .TP 82 | .B $XDG_CACHE_HOME/urlwatch/cache.db 83 | A SQLite 3 database that contains the state history of jobs (for diffing) 84 | .SH AUTHOR 85 | Thomas Perl 86 | .SH WEBSITE 87 | 
https://thp.io/2008/urlwatch/ 88 | -------------------------------------------------------------------------------- /lib/urlwatch/ical2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | def ical2text(ical_string): 32 | import vobject 33 | result = [] 34 | if isinstance(ical_string, str): 35 | parsedCal = vobject.readOne(ical_string) 36 | else: 37 | try: 38 | parsedCal = vobject.readOne(ical_string) 39 | except Exception as e: 40 | parsedCal = vobject.readOne(ical_string.decode('utf-8', 'ignore')) 41 | 42 | for event in parsedCal.getChildren(): 43 | if event.name == 'VEVENT': 44 | if hasattr(event, 'dtstart'): 45 | start = event.dtstart.value.strftime('%F %H:%M') 46 | else: 47 | start = 'unknown start date' 48 | 49 | if hasattr(event, 'dtend'): 50 | end = event.dtend.value.strftime('%F %H:%M') 51 | else: 52 | end = start 53 | 54 | if start == end: 55 | date_str = start 56 | else: 57 | date_str = '%s -- %s' % (start, end) 58 | 59 | result.append('%s: %s' % (date_str, event.summary.value)) 60 | 61 | return '\n'.join(result) 62 | -------------------------------------------------------------------------------- /lib/urlwatch/migration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 
18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import logging 32 | import os.path 33 | import sys 34 | 35 | from .util import atomic_rename 36 | from .storage import UrlsYaml, UrlsTxt, CacheDirStorage 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | def migrate_urls(urlwatcher): 42 | # Migrate urlwatch 1.x URLs to urlwatch 2.x 43 | 44 | urlwatch_config = urlwatcher.urlwatch_config 45 | pkgname = urlwatch_config.pkgname 46 | urls = urlwatch_config.urls 47 | urls_txt = os.path.join(urlwatch_config.urlwatch_dir, 'urls.txt') 48 | edit = urlwatch_config.edit 49 | add = urlwatch_config.add 50 | features = urlwatch_config.features 51 | edit_hooks = urlwatch_config.edit_hooks 52 | edit_config = urlwatch_config.edit_config 53 | gc_cache = urlwatch_config.gc_cache 54 | 55 | if os.path.isfile(urls_txt) and not os.path.isfile(urls): 56 | print(""" 57 | Migrating URLs: {urls_txt} -> {urls_yaml} 58 | Use "{pkgname} --edit" to customize it. 
59 | """.format(urls_txt=urls_txt, urls_yaml=urls, pkgname=pkgname)) 60 | UrlsYaml(urls).save(UrlsTxt(urls_txt).load_secure()) 61 | atomic_rename(urls_txt, urls_txt + '.migrated') 62 | 63 | if not any([os.path.isfile(urls), edit, add, features, edit_hooks, edit_config, gc_cache]): 64 | print(""" 65 | You need to create {urls_yaml} in order to use {pkgname}. 66 | Use "{pkgname} --edit" to open the file with your editor. 67 | """.format(urls_yaml=urls, pkgname=pkgname)) 68 | sys.exit(1) 69 | 70 | 71 | def migrate_cache(urlwatcher): 72 | # Migrate urlwatch 1.x cache to urlwatch 2.x 73 | 74 | urlwatch_config = urlwatcher.urlwatch_config 75 | cache = urlwatch_config.cache 76 | cache_dir = os.path.join(urlwatch_config.urlwatch_dir, 'cache') 77 | 78 | # On Windows and macOS with case-insensitive filesystems, we have to check if 79 | # "cache.db" exists in the folder, and in this case, avoid migration (Issue #223) 80 | if os.path.isdir(cache_dir) and not os.path.isfile(os.path.join(cache_dir, 'cache.db')): 81 | print(""" 82 | Migrating cache: {cache_dir} -> {cache_db} 83 | """.format(cache_dir=cache_dir, cache_db=cache)) 84 | 85 | old_cache_storage = CacheDirStorage(cache_dir) 86 | urlwatcher.cache_storage.restore(old_cache_storage.backup()) 87 | urlwatcher.cache_storage.gc([job.get_guid() for job in urlwatcher.jobs]) 88 | atomic_rename(cache_dir, cache_dir + '.migrated') 89 | -------------------------------------------------------------------------------- /lib/urlwatch/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. 
Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import imp 32 | import logging 33 | import os 34 | 35 | from .handler import Report 36 | from .worker import run_jobs 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class Urlwatch(object): 42 | def __init__(self, urlwatch_config, config_storage, cache_storage, urls_storage): 43 | 44 | self.urlwatch_config = urlwatch_config 45 | 46 | logger.info('Using %s as URLs file', self.urlwatch_config.urls) 47 | logger.info('Using %s for hooks', self.urlwatch_config.hooks) 48 | logger.info('Using %s as cache database', self.urlwatch_config.cache) 49 | 50 | self.config_storage = config_storage 51 | self.cache_storage = cache_storage 52 | self.urls_storage = urls_storage 53 | 54 | self.report = Report(self) 55 | self.jobs = None 56 | 57 | self.check_directories() 58 | 59 | if hasattr(self.urlwatch_config, 'migrate_urls'): 60 | self.urlwatch_config.migrate_urls(self) 61 | 62 | if not self.urlwatch_config.edit_hooks: 63 | self.load_hooks() 64 | 65 | if not self.urlwatch_config.edit: 66 | self.load_jobs() 67 | 68 | if hasattr(self.urlwatch_config, 'migrate_urls'): 69 | self.urlwatch_config.migrate_cache(self) 70 | 71 | def check_directories(self): 72 | if not os.path.isdir(self.urlwatch_config.urlwatch_dir): 73 | os.makedirs(self.urlwatch_config.urlwatch_dir) 74 | if not os.path.exists(self.urlwatch_config.config): 75 | self.config_storage.write_default_config(self.urlwatch_config.config) 76 | print(""" 77 | A default config has been written to {config_yaml}. 78 | Use "{pkgname} --edit-config" to customize it. 
79 | """.format(config_yaml=self.urlwatch_config.config, pkgname=self.urlwatch_config.pkgname)) 80 | 81 | def load_hooks(self): 82 | if os.path.exists(self.urlwatch_config.hooks): 83 | imp.load_source('hooks', self.urlwatch_config.hooks) 84 | 85 | def load_jobs(self): 86 | if os.path.isfile(self.urlwatch_config.urls): 87 | jobs = self.urls_storage.load_secure() 88 | logger.info('Found {0} jobs'.format(len(jobs))) 89 | else: 90 | logger.warn('No jobs file found') 91 | jobs = [] 92 | 93 | self.jobs = jobs 94 | 95 | def run_jobs(self): 96 | run_jobs(self) 97 | 98 | def close(self): 99 | self.report.finish() 100 | self.cache_storage.close() 101 | -------------------------------------------------------------------------------- /lib/urlwatch/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import logging 32 | import os 33 | import platform 34 | import subprocess 35 | import shlex 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | class TrackSubClasses(type): 41 | """A metaclass that stores subclass name-to-class mappings in the base class""" 42 | 43 | @staticmethod 44 | def sorted_by_kind(cls): 45 | return [item for _, item in sorted((it.__kind__, it) for it in cls.__subclasses__.values())] 46 | 47 | def __init__(cls, name, bases, namespace): 48 | for base in bases: 49 | if base == object: 50 | continue 51 | 52 | for attr in ('__required__', '__optional__'): 53 | if not hasattr(base, attr): 54 | continue 55 | 56 | inherited = getattr(base, attr, ()) 57 | new_value = tuple(namespace.get(attr, ())) + tuple(inherited) 58 | namespace[attr] = new_value 59 | setattr(cls, attr, new_value) 60 | 61 | for base in bases: 62 | if base == object: 63 | continue 64 | 65 | if hasattr(cls, '__kind__'): 66 | subclasses = getattr(base, '__subclasses__', None) 67 | if subclasses is not None: 68 | logger.info('Registering %r as %s', cls, cls.__kind__) 69 | subclasses[cls.__kind__] = cls 70 | break 71 | else: 72 | anonymous_subclasses = getattr(base, '__anonymous_subclasses__', None) 73 | if anonymous_subclasses is not None: 74 | logger.info('Registering %r', cls) 75 | anonymous_subclasses.append(cls) 76 | break 77 | 78 | super().__init__(name, bases, namespace) 79 | 80 | 81 | def atomic_rename(old_filename, 
new_filename): 82 | if platform.system() == 'Windows' and os.path.exists(new_filename): 83 | new_old_filename = new_filename + '.bak' 84 | if os.path.exists(new_old_filename): 85 | os.remove(new_old_filename) 86 | os.rename(new_filename, new_old_filename) 87 | os.rename(old_filename, new_filename) 88 | if os.path.exists(new_old_filename): 89 | os.remove(new_old_filename) 90 | else: 91 | os.rename(old_filename, new_filename) 92 | 93 | 94 | def edit_file(filename): 95 | editor = os.environ.get('EDITOR', None) 96 | if not editor: 97 | editor = os.environ.get('VISUAL', None) 98 | if not editor: 99 | raise SystemExit('Please set $VISUAL or $EDITOR.') 100 | 101 | subprocess.check_call(shlex.split(editor) + [filename]) 102 | -------------------------------------------------------------------------------- /share/urlwatch/examples/hooks.py.example: -------------------------------------------------------------------------------- 1 | # 2 | # Example hooks file for urlwatch 3 | # 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 1. Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 2. Redistributions in binary form must reproduce the above copyright 13 | # notice, this list of conditions and the following disclaimer in the 14 | # documentation and/or other materials provided with the distribution. 15 | # 3. The name of the author may not be used to endorse or promote products 16 | # derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
21 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | # 29 | 30 | import re 31 | 32 | from urlwatch import filters 33 | from urlwatch import jobs 34 | from urlwatch import reporters 35 | 36 | 37 | #class CustomLoginJob(jobs.UrlJob): 38 | # """Custom login for my webpage""" 39 | # 40 | # __kind__ = 'custom-login' 41 | # __required__ = ('username', 'password') 42 | # 43 | # def retrieve(self, job_state): 44 | # return 'Would log in to {} with {} and {}\n'.format(self.url, self.username, self.password) 45 | 46 | 47 | #class CaseFilter(filters.FilterBase): 48 | # """Custom filter for changing case, needs to be selected manually""" 49 | # 50 | # __kind__ = 'case' 51 | # 52 | # def filter(self, data, subfilter=None): 53 | # # The subfilter is specified using a colon, for example the "case" 54 | # # filter here can be specified as "case:upper" and "case:lower" 55 | # 56 | # if subfilter is None: 57 | # subfilter = 'upper' 58 | # 59 | # if subfilter == 'upper': 60 | # return data.upper() 61 | # elif subfilter == 'lower': 62 | # return data.lower() 63 | # else: 64 | # raise ValueError('Unknown case subfilter: %r' % (subfilter,)) 65 | 66 | 67 | #class IndentFilter(filters.FilterBase): 68 | # """Custom filter for indenting, needs to be selected manually""" 69 | # 70 | # __kind__ = 'indent' 71 | # 72 | # def filter(self, data, subfilter=None): 73 | # # The subfilter here is a number of characters to indent 74 | # 75 | # if subfilter is None: 76 | # indent = 8 77 | # else: 78 | # indent = 
class CustomMatchUrlFilter(filters.AutoMatchFilter):
    """Automatic filter applied to every job whose properties match MATCH."""

    # AutoMatchFilter subclasses are applied automatically to all jobs
    # whose attributes equal the values given here.
    MATCH = {'url': 'http://example.org/'}

    def filter(self, data):
        rewritten = data.replace('foo', 'bar')
        return rewritten


class CustomRegexMatchUrlFilter(filters.RegexMatchFilter):
    """Like CustomMatchUrlFilter, but job properties are matched by regex."""

    MATCH = {'url': re.compile('http://example.org/.*')}

    def filter(self, data):
        rewritten = data.replace('foo', 'bar')
        return rewritten


class CustomTextFileReporter(reporters.TextReporter):
    """Custom reporter that writes the text-only report to a file"""

    __kind__ = 'custom_file'

    def submit(self):
        report_text = '\n'.join(super().submit())
        with open(self.config['filename'], 'w') as fp:
            fp.write(report_text)


class CustomHtmlFileReporter(reporters.HtmlReporter):
    """Custom reporter that writes the HTML report to a file"""

    __kind__ = 'custom_html'

    def submit(self):
        report_html = '\n'.join(super().submit())
        with open(self.config['filename'], 'w') as fp:
            fp.write(report_html)
# File and folder paths
import logging
import os.path
import signal
import socket
import sys

from appdirs import AppDirs

pkgname = 'urlwatch'
# Legacy location (~/.urlwatch); preferred if it already exists.
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urlwatch_cache_dir = AppDirs(pkgname).user_cache_dir

if not os.path.exists(urlwatch_dir):
    # No legacy directory: fall back to the platform config dir (XDG etc.).
    urlwatch_dir = AppDirs(pkgname).user_config_dir

# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Running from a source checkout: make the bundled lib/ importable.
    # NOTE: this sys.path tweak must happen BEFORE the urlwatch imports below.
    sys.path.insert(0, os.path.join(prefix, bindir, 'lib'))

from urlwatch.command import UrlwatchCommand
from urlwatch.config import CommandConfig
from urlwatch.main import Urlwatch
from urlwatch.storage import YamlConfigStorage, CacheMiniDBStorage, UrlsYaml

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Ignore SIGPIPE for stdout (see https://github.com/thp/urlwatch/issues/77)
try:
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
except AttributeError:
    # Windows does not have signal.SIGPIPE
    ...

logger = logging.getLogger(pkgname)

# Default file names inside the configuration/cache directories.
CONFIG_FILE = 'urlwatch.yaml'
URLS_FILE = 'urls.yaml'
CACHE_FILE = 'cache.db'
HOOKS_FILE = 'hooks.py'


def setup_logger(verbose):
    """Attach a DEBUG-level console handler to the root logger when *verbose*.

    Repeated calls add one handler per call; this is only invoked once
    from main().
    """
    if verbose:
        root_logger = logging.getLogger('')
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter('%(asctime)s %(module)s %(levelname)s: %(message)s'))
        root_logger.addHandler(console)
        root_logger.setLevel(logging.DEBUG)
        root_logger.info('turning on verbose logging mode')


def main():
    """Command-line entry point: resolve file paths, build the watcher, run it."""
    config_file = os.path.join(urlwatch_dir, CONFIG_FILE)
    urls_file = os.path.join(urlwatch_dir, URLS_FILE)
    hooks_file = os.path.join(urlwatch_dir, HOOKS_FILE)
    new_cache_file = os.path.join(urlwatch_cache_dir, CACHE_FILE)
    old_cache_file = os.path.join(urlwatch_dir, CACHE_FILE)
    # Prefer the new cache location, but keep using a pre-existing old one.
    cache_file = new_cache_file
    if os.path.exists(old_cache_file) and not os.path.exists(new_cache_file):
        cache_file = old_cache_file

    # CommandConfig also parses command-line arguments (see config.py).
    command_config = CommandConfig(pkgname, urlwatch_dir, bindir, prefix,
                                   config_file, urls_file, hooks_file, cache_file, False)
    setup_logger(command_config.verbose)

    # setup storage API
    config_storage = YamlConfigStorage(command_config.config)
    cache_storage = CacheMiniDBStorage(command_config.cache)
    urls_storage = UrlsYaml(command_config.urls)

    # setup urlwatcher
    urlwatch = Urlwatch(command_config, config_storage, cache_storage, urls_storage)
    urlwatch_command = UrlwatchCommand(urlwatch)

    # run urlwatcher
    urlwatch_command.run()


if __name__ == '__main__':
    main()
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import smtplib 32 | import getpass 33 | import subprocess 34 | import logging 35 | 36 | try: 37 | import keyring 38 | except ImportError: 39 | keyring = None 40 | 41 | import email.mime.multipart 42 | import email.mime.text 43 | import email.utils 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | class Mailer(object): 49 | def send(self, msg): 50 | raise NotImplementedError 51 | 52 | def msg_plain(self, from_email, to_email, subject, body): 53 | msg = email.mime.text.MIMEText(body, 'plain', 'utf-8') 54 | msg['Subject'] = subject 55 | msg['From'] = from_email 56 | msg['To'] = to_email 57 | msg['Date'] = email.utils.formatdate() 58 | 59 | return msg 60 | 61 | def msg_html(self, from_email, to_email, subject, body_text, body_html): 62 | msg = email.mime.multipart.MIMEMultipart('alternative') 63 | msg['Subject'] = subject 64 | msg['From'] = from_email 65 | msg['To'] = to_email 66 | msg['Date'] = email.utils.formatdate() 67 | 68 | msg.attach(email.mime.text.MIMEText(body_text, 'plain', 'utf-8')) 69 | msg.attach(email.mime.text.MIMEText(body_html, 'html', 'utf-8')) 70 | 71 | return msg 72 | 73 | 74 | class SMTPMailer(Mailer): 75 | def __init__(self, smtp_user, smtp_server, smtp_port, tls, auth, insecure_password=None): 76 | self.smtp_server = smtp_server 77 | self.smtp_user = smtp_user 78 | self.smtp_port = smtp_port 79 | self.tls = tls 80 | self.auth = auth 81 | self.insecure_password = insecure_password 82 | 83 | 
def send(self, msg): 84 | s = smtplib.SMTP(self.smtp_server, self.smtp_port) 85 | s.ehlo() 86 | 87 | if self.tls: 88 | s.starttls() 89 | 90 | if self.auth: 91 | if self.insecure_password: 92 | passwd = self.insecure_password 93 | elif keyring is not None: 94 | passwd = keyring.get_password(self.smtp_server, self.smtp_user) 95 | if passwd is None: 96 | raise ValueError('No password available in keyring for {}, {}'.format(self.smtp_server, self.smtp_user)) 97 | s.login(self.smtp_user, passwd) 98 | 99 | s.sendmail(msg['From'], msg['To'].split(','), msg.as_string()) 100 | s.quit() 101 | 102 | 103 | class SendmailMailer(Mailer): 104 | def __init__(self, sendmail_path): 105 | self.sendmail_path = sendmail_path 106 | 107 | def send(self, msg): 108 | p = subprocess.Popen([self.sendmail_path, '-oi', msg['To']], 109 | stdin=subprocess.PIPE, 110 | stderr=subprocess.PIPE, 111 | universal_newlines=True) 112 | result = p.communicate(msg.as_string()) 113 | if p.returncode: 114 | logger.error('Sendmail failed with {result}'.format(result=result)) 115 | 116 | 117 | def have_password(smtp_server, from_email): 118 | return keyring.get_password(smtp_server, from_email) is not None 119 | 120 | 121 | def set_password(smtp_server, from_email): 122 | ''' Set the keyring password for the mail connection. Interactive.''' 123 | if keyring is None: 124 | raise ImportError('keyring module missing - service unsupported') 125 | 126 | password = getpass.getpass(prompt='Enter password for {} using {}: '.format(from_email, smtp_server)) 127 | keyring.set_password(smtp_server, from_email, password) 128 | -------------------------------------------------------------------------------- /lib/urlwatch/worker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import concurrent.futures
import logging
import difflib

logger = logging.getLogger(__name__)

# Upper bound on jobs processed concurrently.
MAX_WORKERS = 10


def run_parallel(func, items):
    """Yield func(item) for every item, computed on a thread pool.

    Results are yielded in completion order, not input order.  The first
    exception raised by any worker is re-raised in the caller.  The pool
    is now used as a context manager so its threads are shut down
    deterministically (the old code leaked the executor).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for future in concurrent.futures.as_completed(executor.submit(func, item) for item in items):
            exception = future.exception()
            if exception is not None:
                raise exception
            yield future.result()


def run_jobs(urlwatcher):
    """Run all configured jobs once and feed their outcomes into the report.

    Every job is processed in parallel; each result is classified as
    unchanged / changed / new / error and the per-job state is written
    back to the cache storage.
    """
    # Deferred imports from sibling modules of this package (run-time only).
    from .handler import JobState
    from .jobs import NotModifiedError

    cache_storage = urlwatcher.cache_storage
    jobs = [job.with_defaults(urlwatcher.config_storage.config)
            for job in urlwatcher.jobs]
    report = urlwatcher.report

    # Let every job register any shared resources it needs up front.
    resources = {}
    for job in jobs:
        job.request_resources(resources)

    logger.debug('Processing %d jobs', len(jobs))
    for job_state in run_parallel(lambda job_state: job_state.process(),
                                  (JobState(cache_storage, resources, job) for job in jobs)):
        logger.debug('Job finished: %s', job_state.job)

        # Treat a missing/None max_tries as "no retries" (0).
        max_tries = job_state.job.max_tries or 0
        logger.debug('Using max_tries of %i for %s', max_tries, job_state.job)

        if job_state.exception is not None:
            if job_state.error_ignored:
                logger.info('Error while executing job %s ignored due to job config', job_state.job)
            elif isinstance(job_state.exception, NotModifiedError):
                # HTTP 304: nothing changed; reset the retry counter.
                logger.info('Job %s has not changed (HTTP 304)', job_state.job)
                report.unchanged(job_state)
                if job_state.tries > 0:
                    job_state.tries = 0
                    job_state.save()
            elif job_state.tries < max_tries:
                # Transient failure with retries left: record the attempt only.
                logger.debug('This was try %i of %i for job %s', job_state.tries,
                             max_tries, job_state.job)
                job_state.save()
            else:
                # Retries exhausted (tries >= max_tries): surface the error.
                logger.debug('We are now at %i tries ', job_state.tries)
                job_state.save()
                report.error(job_state)

        elif job_state.old_data is not None:
            # Previous data exists: compare against stored history.
            matched_history_time = job_state.history_data.get(job_state.new_data)
            if matched_history_time:
                job_state.timestamp = matched_history_time
            if matched_history_time or job_state.new_data == job_state.old_data:
                report.unchanged(job_state)
                if job_state.tries > 0:
                    job_state.tries = 0
                    job_state.save()
            else:
                # Diff against the most similar historical version, if any.
                close_matches = difflib.get_close_matches(job_state.new_data, job_state.history_data, n=1)
                if close_matches:
                    job_state.old_data = close_matches[0]
                    job_state.timestamp = job_state.history_data[close_matches[0]]
                report.changed(job_state)
                job_state.tries = 0
                job_state.save()
        else:
            # First successful run for this job.
            report.new(job_state)
            job_state.tries = 0
            job_state.save()

    for job in jobs:
        job.release_resources(resources)
import re
import os
import subprocess
import logging

logger = logging.getLogger(__name__)


def html2text(data, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' - Use "lynx -dump" for conversion
              options: see "lynx -help" output for options that work with "-dump"
     'html2text' - Use "html2text -nobs" for conversion
                   options: https://linux.die.net/man/1/html2text
     'bs4' - Use Beautiful Soup library to prettify the HTML
             options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
             http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're' - A simple regex-based HTML tag stripper
     'pyhtml2text' - Use Python module "html2text"
                     options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    Raises ValueError for an unknown method.
    """
    if method == 're':
        # Strip tags, then drop blank lines and trailing whitespace.
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip() for line in stripped_tags.splitlines() if line.strip() != '')

    if method == 'pyhtml2text':
        # Local import: optional dependency. Aliased so it does not shadow
        # this function's own name.
        import html2text as html2text_module
        parser = html2text_module.HTML2Text()
        for key, value in options.items():
            setattr(parser, key.lower(), value)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        # Use .get() instead of .pop(): .pop() mutated the caller's options
        # dict, so every call after the first silently fell back to the
        # default parser.
        parser = options.get('parser', 'html.parser')
        soup = BeautifulSoup(data, parser)
        return soup.get_text(strip=True)

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset UTF-8', '-display_charset UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    stdout_encoding = 'utf-8'

    for k, v in options.items():
        # A boolean True means "pass the bare flag"; any other value is
        # appended after the flag.  The original ternary had these two
        # cases swapped, emitting "-flag True" and dropping real values.
        # (Flag and value are kept as one argv token, as before — TODO
        # confirm lynx/html2text accept this form.)
        cmd.append('-%s' % k if v is True else '-%s %s' % (k, v))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Force a UTF-8 locale so the converters emit predictable output.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout)

    return stdout.strip()
import argparse
import logging
import os

logger = logging.getLogger(__name__)


class BaseConfig(object):
    """Plain container for the paths and settings shared by urlwatch commands."""

    def __init__(self, pkgname, urlwatch_dir, config, urls, cache, hooks, verbose):
        self.pkgname = pkgname            # package name ('urlwatch')
        self.urlwatch_dir = urlwatch_dir  # configuration directory
        self.config = config              # path to urlwatch.yaml
        self.urls = urls                  # path to the job list file
        self.cache = cache                # path to the cache database
        self.hooks = hooks                # path to the hooks.py module
        self.verbose = verbose            # enable debug logging


class CommandConfig(BaseConfig):
    """BaseConfig extended with install-layout detection and CLI parsing.

    Constructing an instance parses sys.argv (see parse_args), so every
    command-line option becomes an attribute of this object.
    """

    def __init__(self, pkgname, urlwatch_dir, bindir, prefix, config, urls, hooks, cache, verbose):
        super().__init__(pkgname, urlwatch_dir, config, urls, cache, hooks, verbose)
        # Deferred import from a sibling module so this class definition
        # does not require the full package at import time.
        from .migration import migrate_cache, migrate_urls

        self.bindir = bindir
        self.prefix = prefix
        self.migrate_cache = migrate_cache
        self.migrate_urls = migrate_urls

        if self.bindir == 'bin':
            # Installed system-wide
            self.examples_dir = os.path.join(prefix, 'share', self.pkgname, 'examples')
        else:
            # Assume we are not yet installed
            self.examples_dir = os.path.join(prefix, bindir, 'share', self.pkgname, 'examples')

        self.urls_yaml_example = os.path.join(self.examples_dir, 'urls.yaml.example')
        self.hooks_py_example = os.path.join(self.examples_dir, 'hooks.py.example')

        self.parse_args()

    def parse_args(self):
        """Parse sys.argv and copy every option onto self as an attribute."""
        import urlwatch  # deferred: only needed for --version/--help text

        parser = argparse.ArgumentParser(description=urlwatch.__doc__,
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument('--version', action='version', version='%(prog)s {}'.format(urlwatch.__version__))
        parser.add_argument('-v', '--verbose', action='store_true', help='show debug output')
        group = parser.add_argument_group('files and directories')
        group.add_argument('--urls', metavar='FILE', help='read job list (URLs) from FILE',
                           default=self.urls)
        group.add_argument('--config', metavar='FILE', help='read configuration from FILE',
                           default=self.config)
        group.add_argument('--hooks', metavar='FILE', help='use FILE as hooks.py module',
                           default=self.hooks)
        group.add_argument('--cache', metavar='FILE', help='use FILE as cache database',
                           default=self.cache)

        group = parser.add_argument_group('Authentication')
        group.add_argument('--smtp-login', action='store_true', help='Enter password for SMTP (store in keyring)')
        group.add_argument('--telegram-chats', action='store_true', help='List telegram chats the bot is joined to')
        group.add_argument('--test-slack', action='store_true', help='Send a test notification to Slack')

        group = parser.add_argument_group('job list management')
        group.add_argument('--list', action='store_true', help='list jobs')
        group.add_argument('--add', metavar='JOB', help='add job (key1=value1,key2=value2,...)')
        group.add_argument('--delete', metavar='JOB', help='delete job by location or index')
        group.add_argument('--test-filter', metavar='JOB', help='test filter output of job by location or index')
        group = parser.add_argument_group('interactive commands ($EDITOR/$VISUAL)')
        group.add_argument('--edit', action='store_true', help='edit URL/job list')
        group.add_argument('--edit-config', action='store_true', help='edit configuration file')
        group.add_argument('--edit-hooks', action='store_true', help='edit hooks script')
        group = parser.add_argument_group('miscellaneous')
        group.add_argument('--features', action='store_true', help='list supported jobs/filters/reporters')
        group.add_argument('--gc-cache', action='store_true', help='remove old cache entries')
        group.add_argument('--report-timestamps', action='store_true', help='report job timestamps')

        args = parser.parse_args()

        # Copy every parsed option onto this object.  (The old loop used
        # enumerate() with an unused index plus a getattr() per name.)
        for arg, argval in vars(args).items():
            setattr(self, arg, argval)
foo
23 |
bar
24 | 25 | expected_result: |- 26 |
foo
bar
27 | element_by_id: 28 | filter: element-by-id:bar 29 | data: | 30 | 31 |
asdf bar
32 |
asdf bar hoho
33 | 34 | expected_result: |- 35 |
asdf bar hoho
36 | element_by_class: 37 | filter: element-by-class:foo 38 | data: | 39 | 40 |
foo
41 |
bar
42 | 43 | expected_result: |- 44 |
foo
45 | xpath_elements: 46 | filter: xpath://div | //*[@id="bar"] 47 | data: | 48 | 49 |
foo
50 |
bar
51 | 52 | expected_result: | 53 |
foo
54 | 55 |
bar
56 | xpath_text: 57 | filter: xpath://div[1]/text() | //div[2]/@id 58 | data: | 59 | 60 |
foo
61 |
bar
62 | 63 | expected_result: |- 64 | foo 65 | bar 66 | xpath_exclude: 67 | filter: 68 | xpath: 69 | path: //div 70 | exclude: //*[@class='excl'] | //*/@class 71 | data: | 72 | 73 |
you don't want to see me
74 |
finterrupt!ointerrupt!o
75 |
bar
76 | 77 | expected_result: | 78 |
foo
79 | 80 |
bar
81 | xpath_xml_namespaces: 82 | filter: 83 | xpath: 84 | path: //item | //f:item 85 | method: xml 86 | exclude: //f:year | //author 87 | namespaces: 88 | f: foo 89 | data: | 90 | 91 | 92 | 2017 93 | Tom 94 | abc 95 | 96 | 97 | 2018 98 | Jerry 99 | xyz 100 | 101 | 102 | expected_result: | 103 | 104 | 105 | 106 | abc 107 | 108 | 109 | 110 | 2018 111 | Jerry 112 | xyz 113 | 114 | css: 115 | filter: css:div 116 | data: | 117 | 118 |
foo
119 |
bar
120 | 121 | expected_result: | 122 |
foo
123 | 124 |
bar
125 | css_exclude: 126 | filter: 127 | css: 128 | selector: div 129 | exclude: '.excl, #bar' 130 | data: | 131 | 132 |
you don't want to see me
133 |
finterrupt!ointerrupt!o
134 |
bar
135 | 136 | expected_result: | 137 |
foo
138 | css_xml_namespaces: 139 | filter: 140 | css: 141 | selector: item, f|item 142 | method: xml 143 | exclude: f|year, author 144 | namespaces: 145 | f: foo 146 | data: | 147 | 148 | 149 | 2017 150 | Tom 151 | abc 152 | 153 | 154 | 2018 155 | Jerry 156 | xyz 157 | 158 | 159 | expected_result: | 160 | 161 | 162 | 163 | abc 164 | 165 | 166 | 167 | 2018 168 | Jerry 169 | xyz 170 | 171 | grep: 172 | filter: grep:blue 173 | data: | 174 | The rose is red; 175 | the violet's blue. 176 | Sugar is sweet, 177 | and so are you. 178 | expected_result: |- 179 | the violet's blue. 180 | grep_with_comma: 181 | filter: grep:\054 182 | data: | 183 | The rose is red; 184 | the violet's blue. 185 | Sugar is sweet, 186 | and so are you. 187 | expected_result: |- 188 | Sugar is sweet, 189 | json_format: 190 | filter: format-json 191 | data: | 192 | {"field1": {"f1.1": "value"},"field2": "value"} 193 | expected_result: |- 194 | { 195 | "field1": { 196 | "f1.1": "value" 197 | }, 198 | "field2": "value" 199 | } 200 | json_format_subfilter: 201 | filter: format-json:2 202 | data: | 203 | {"field1": {"f1.1": "value"},"field2": "value"} 204 | expected_result: |- 205 | { 206 | "field1": { 207 | "f1.1": "value" 208 | }, 209 | "field2": "value" 210 | } 211 | sha1: 212 | filter: sha1sum 213 | data: 1234567890abcdefg 214 | expected_result: 8417680c09644df743d7cea1366fbe13a31b2d5e 215 | hexdump: 216 | filter: hexdump 217 | data: | 218 | Hello world! 219 | 你好,世界! 220 | expected_result: |- 221 | 48 65 6c 6c 6f 20 77 6f 72 6c 64 21 0a e4 bd a0 Hello world!.... 222 | e5 a5 bd ef bc 8c e4 b8 96 e7 95 8c ef bc 81 0a ................ 223 | -------------------------------------------------------------------------------- /lib/urlwatch/handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import datetime 32 | import logging 33 | import time 34 | import traceback 35 | 36 | from .filters import FilterBase 37 | from .jobs import NotModifiedError 38 | from .reporters import ReporterBase 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class JobState(object): 44 | def __init__(self, cache_storage, resources, job): 45 | self.cache_storage = cache_storage 46 | self.resources = resources 47 | self.job = job 48 | self.verb = None 49 | self.old_data = None 50 | self.new_data = None 51 | self.history_data = {} 52 | self.timestamp = None 53 | self.exception = None 54 | self.traceback = None 55 | self.tries = 0 56 | self.etag = None 57 | self.error_ignored = False 58 | 59 | def load(self): 60 | guid = self.job.get_guid() 61 | self.old_data, self.timestamp, self.tries, self.etag = self.cache_storage.load(self.job, guid) 62 | if self.tries is None: 63 | self.tries = 0 64 | if self.job.compared_versions and self.job.compared_versions > 1: 65 | self.history_data = self.cache_storage.get_history_data(guid, self.job.compared_versions) 66 | 67 | def save(self): 68 | if self.new_data is None and self.exception is not None: 69 | # If no new data has been retrieved due to an exception, use the old job data 70 | self.new_data = self.old_data 71 | 72 | self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time(), self.tries, self.etag) 73 | 74 | def process(self): 75 | now_epoch = datetime.datetime.now().strftime('%s') 76 | logger.info('Processing: %s', self.job) 77 | try: 78 | try: 79 | self.load() 80 | data = self.job.retrieve(self) 81 | 82 | # Apply automatic filters first 83 | data = FilterBase.auto_process(self, data) 84 | 85 | # Apply any specified filters 86 | filter_list = self.job.filter 87 | 88 | if filter_list: 89 | if isinstance(filter_list, list): 90 | for item in filter_list: 91 | key = next(iter(item)) 92 | filter_kind, subfilter = key, item[key] 93 | data = FilterBase.process(filter_kind, subfilter, self, 
data) 94 | elif isinstance(filter_list, str): 95 | for filter_kind in filter_list.split(','): 96 | if ':' in filter_kind: 97 | filter_kind, subfilter = filter_kind.split(':', 1) 98 | else: 99 | subfilter = None 100 | data = FilterBase.process(filter_kind, subfilter, self, data) 101 | self.new_data = data 102 | 103 | except Exception as e: 104 | # job has a chance to format and ignore its error 105 | self.exception = e 106 | self.traceback = self.job.format_error(e, traceback.format_exc()) 107 | self.new_data = "ERROR|%s: %s" % (now_epoch, e) 108 | self.error_ignored = self.job.ignore_error(e) 109 | if not (self.error_ignored or isinstance(e, NotModifiedError)): 110 | self.tries += 1 111 | logger.debug('Increasing number of tries to %i for %s', self.tries, self.job) 112 | except Exception as e: 113 | # job failed its chance to handle error 114 | self.exception = e 115 | self.traceback = traceback.format_exc() 116 | self.new_data = "ERROR|%s: %s" % (now_epoch, e) 117 | self.error_ignored = False 118 | if not isinstance(e, NotModifiedError): 119 | self.tries += 1 120 | logger.debug('Increasing number of tries to %i for %s', self.tries, self.job) 121 | 122 | return self 123 | 124 | 125 | class Report(object): 126 | def __init__(self, urlwatch_config): 127 | self.config = urlwatch_config.config_storage.config 128 | 129 | self.job_states = [] 130 | self.start = datetime.datetime.now() 131 | 132 | def _result(self, verb, job_state): 133 | if job_state.exception is not None: 134 | # TODO: Once we require Python >= 3.5, we can just pass in job_state.exception as "exc_info" parameter 135 | exc_info = (type(job_state.exception), job_state.exception, job_state.exception.__traceback__) 136 | logger.debug('Got exception while processing %r', job_state.job, exc_info=exc_info) 137 | 138 | job_state.verb = verb 139 | self.job_states.append(job_state) 140 | 141 | def new(self, job_state): 142 | self._result('new', job_state) 143 | 144 | def changed(self, job_state): 145 | 
self._result('changed', job_state) 146 | 147 | def unchanged(self, job_state): 148 | self._result('unchanged', job_state) 149 | 150 | def error(self, job_state): 151 | self._result('error', job_state) 152 | 153 | def get_filtered_job_states(self, job_states): 154 | for job_state in job_states: 155 | if not any(job_state.verb == verb and not self.config['display'][verb] 156 | for verb in ('unchanged', 'new', 'error')): 157 | yield job_state 158 | 159 | def finish(self): 160 | end = datetime.datetime.now() 161 | duration = (end - self.start) 162 | 163 | ReporterBase.submit_all(self, self.job_states, duration) 164 | -------------------------------------------------------------------------------- /test/test_handler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from glob import glob 3 | 4 | import pycodestyle as pycodestyle 5 | from urlwatch.jobs import UrlJob, JobBase, ShellJob 6 | from urlwatch.storage import UrlsYaml, UrlsTxt 7 | 8 | from nose.tools import raises, with_setup 9 | 10 | import tempfile 11 | import os 12 | import imp 13 | 14 | from urlwatch import storage 15 | from urlwatch.config import BaseConfig 16 | from urlwatch.storage import YamlConfigStorage, CacheMiniDBStorage 17 | from urlwatch.main import Urlwatch 18 | 19 | 20 | def test_required_classattrs_in_subclasses(): 21 | for kind, subclass in JobBase.__subclasses__.items(): 22 | assert hasattr(subclass, '__kind__') 23 | assert hasattr(subclass, '__required__') 24 | assert hasattr(subclass, '__optional__') 25 | 26 | 27 | def test_save_load_jobs(): 28 | jobs = [ 29 | UrlJob(name='news', url='http://news.orf.at/'), 30 | ShellJob(name='list homedir', command='ls ~'), 31 | ShellJob(name='list proc', command='ls /proc'), 32 | ] 33 | 34 | # tempfile.NamedTemporaryFile() doesn't work on Windows 35 | # because the returned file object cannot be opened again 36 | fd, name = tempfile.mkstemp() 37 | UrlsYaml(name).save(jobs) 38 | jobs2 = 
UrlsYaml(name).load()
    os.chmod(name, 0o777)
    jobs3 = UrlsYaml(name).load_secure()
    os.close(fd)
    os.remove(name)

    assert len(jobs2) == len(jobs)
    # Assert that the shell jobs have been removed due to secure loading
    if sys.platform != 'win32':
        assert len(jobs3) == 1


def test_load_config_yaml():
    # The shipped test configuration must equal the built-in defaults.
    config_file = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    if os.path.exists(config_file):
        config = YamlConfigStorage(config_file)
        assert config is not None
        assert config.config is not None
        assert config.config == storage.DEFAULT_CONFIG


def test_load_urls_txt():
    # Legacy plain-text job lists must still parse via secure loading.
    urls_txt = os.path.join(os.path.dirname(__file__), 'data', 'urls.txt')
    if os.path.exists(urls_txt):
        assert len(UrlsTxt(urls_txt).load_secure()) > 0


def test_load_urls_yaml():
    # The shipped example YAML job list must parse via secure loading.
    urls_yaml = 'share/urlwatch/examples/urls.yaml.example'
    if os.path.exists(urls_yaml):
        assert len(UrlsYaml(urls_yaml).load_secure()) > 0


def test_load_hooks_py():
    # The example hooks file must be importable as a Python module.
    hooks_py = 'share/urlwatch/examples/hooks.py.example'
    if os.path.exists(hooks_py):
        imp.load_source('hooks', hooks_py)


def test_pep8_conformance():
    """Test that we conform to PEP-8."""
    style = pycodestyle.StyleGuide(ignore=['E501', 'E402', 'W503'])

    py_files = [y for x in os.walk(os.path.abspath('.')) for y in glob(os.path.join(x[0], '*.py'))]
    result = style.check_files(py_files)
    assert result.total_errors == 0, "Found #{0} code style errors".format(result.total_errors)


class TestConfig(BaseConfig):
    # Minimal urlwatch configuration pointing at the test data directory.
    def __init__(self, config, urls, cache, hooks, verbose):
        (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
        super().__init__('urlwatch', os.path.dirname(__file__), config, urls, cache, hooks, verbose)
        self.edit = False
        self.edit_hooks = False


def teardown_func():
    "tear 
down test fixtures"
    # Remove the cache database created by a test run, if any.
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    if os.path.exists(cache):
        os.remove(cache)


@with_setup(teardown=teardown_func)
def test_run_watcher():
    # Run the watcher end-to-end against the shipped example job list.
    urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'share', 'urlwatch', 'examples', 'urls.yaml.example')
    config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    hooks = ''

    config_storage = YamlConfigStorage(config)
    urls_storage = UrlsYaml(urls)
    cache_storage = CacheMiniDBStorage(cache)
    try:
        urlwatch_config = TestConfig(config, urls, cache, hooks, True)

        urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)
        urlwatcher.run_jobs()
    finally:
        cache_storage.close()


def test_unserialize_shell_job_without_kind():
    # 'kind' may be omitted when the keys identify the job type uniquely.
    job = JobBase.unserialize({
        'name': 'hoho',
        'command': 'ls',
    })
    assert isinstance(job, ShellJob)


@raises(ValueError)
def test_unserialize_with_unknown_key():
    # Unknown keys must be rejected instead of silently ignored.
    JobBase.unserialize({
        'unknown_key': 123,
        'name': 'hoho',
    })


def prepare_retry_test():
    # Build an Urlwatch instance around a job list with an unreachable URL
    # (max_tries: 2 in test/data/invalid-url.yaml), for the retry tests below.
    urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'test', 'data', 'invalid-url.yaml')
    config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
    hooks = ''

    config_storage = YamlConfigStorage(config)
    cache_storage = CacheMiniDBStorage(cache)
    urls_storage = UrlsYaml(urls)

    urlwatch_config = TestConfig(config, urls, cache, hooks, True)
    urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)

    return urlwatcher, cache_storage


@with_setup(teardown=teardown_func)
def test_number_of_tries_in_cache_is_increased():
    # Two failing runs must record two tries in the cache and end in 'error'.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        guid = urlwatcher.jobs[0].get_guid()
        _, _, tries, _ = cache_storage.load(urlwatcher.jobs[0], guid)
        assert tries == 0

        urlwatcher.run_jobs()
        urlwatcher.run_jobs()

        guid = urlwatcher.jobs[0].get_guid()
        _, _, tries, _ = cache_storage.load(urlwatcher.jobs[0], guid)

        assert tries == 2
        assert urlwatcher.report.job_states[-1].verb == 'error'
    finally:
        cache_storage.close()


@with_setup(teardown=teardown_func)
def test_report_error_when_out_of_tries():
    # Once the retry budget is exhausted the job is reported as an error.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0

        urlwatcher.run_jobs()
        urlwatcher.run_jobs()

        assert urlwatcher.report.job_states[-1].verb == 'error'
    finally:
        cache_storage.close()


@with_setup(teardown=teardown_func)
def test_reset_tries_to_zero_when_successful():
    # A successful run must reset the retry counter stored in the cache.
    urlwatcher, cache_storage = prepare_retry_test()
    try:
        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0

        urlwatcher.run_jobs()

        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 1

        # use an url that definitely exists
        job = urlwatcher.jobs[0]
        job.url = 'file://' + os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')

        urlwatcher.run_jobs()

        job = urlwatcher.jobs[0]
        _, _, tries, _ = cache_storage.load(job, job.get_guid())
        assert tries == 0
    finally:
        cache_storage.close()
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 6 | 7 | ## [Unreleased] 8 | 9 | ### Added 10 | - New filter: `re.sub` that can replace/remove strings using regular expressions 11 | 12 | ### Fixed 13 | - Fix `--test-filter` when the specified job is not found 14 | 15 | ### Changed 16 | - Nicer formatting of `--features` for jobs with no docstring or many keys 17 | 18 | 19 | ## [2.17] -- 2019-04-12 20 | 21 | ### Added 22 | - XPath/CSS: Support for excluding elements (#333, by Chenfeng Bao) 23 | - Add support for using external `diff_tool` on Windows (#373, by Chenfeng Bao) 24 | - Document how to use Amazon Simple E-Mail Service "SES" (by mborsetti) 25 | - Compare data with multiple old versions (`compared_versions`, #328, by Chenfeng Bao) 26 | 27 | ### Fixed 28 | - YAML: Fix deprecation warnings (#367, by Florent Aide) 29 | - Updated manpage with new options: Authentication, filter tests (Fixes #351) 30 | - Text formatter: Do not emit empty lines for `line_length=0` (Fixes #357) 31 | 32 | ### Changed 33 | - SMTP configuration fix: Only use smtp.user config if it's a non-empty value 34 | 35 | 36 | ## [2.16] -- 2019-01-27 37 | 38 | ### Added 39 | - XPath: Handle `/text()` selector (#282) 40 | - Document how to specify cookies to README.md (#264) 41 | - Text Reporter: `minimal` config option to only print a summary (PR#304, fixes #147) 42 | - README.md: Document how to watch Github releases via XPath (#266) 43 | - Support for parsing XML/RSS with XPath (Fixes #281) 44 | - Allow explicit setting of `encoding` for URL jobs (PR#313, contributes to #306) 45 | - Slack Channel Reporter (PR#309) 46 | - ANSI color output on the Windows console via `colorama` (PR#296, 
closes #295)
- Support for using CSS selectors via the `cssselect` module (PR#321, closes #273)
- `ignore_http_error_codes` is now an option for URL jobs (PR#325, fixes #203)
- `job_defaults` in the config for globally specifying settings (PR#345, closes #253)
- Optional `timeout` (in seconds) for URL jobs to specify socket timeout (PR#348, closes #340)

### Removed
- Support for JSON storage (dead code that was never used in production; PR#336)

### Changed
- `HtmlReporter` now also highlights links for browser jobs (PR#303)
- Allow `--features` and `--edit-*` to run without `urls.yaml` (PR#301)
- When a previous run had errors, do not use conditional GETs (PR#313, fixes #292)
- Explicitly specify JSON pretty print `separators` for consistency (PR#343)
- Use data-driven unit tests/fixtures for easier unit test maintenance (PR#344)

### Fixed
- Fix migration issues with case-insensitive filesystems (#223)
- Correctly reset retry counter when job is added or unchanged (PR#291, PR#314)
- Fix a `FutureWarning` on Python 3.7 with regard to regular expressions (PR#299)
- If the filter list is empty, do not process the filter list (PR#308)
- Fix parsing/sanity-checking of `urls.yaml` after editing (PR#317, fixes #316)
- Fix Python 3.3 compatibility by depending on `enum34` there (PR#311)
- Allow running unit tests on Windows (PR#318)
- Fix migration issues introduced by PR#180 and #256 (PR#323, fixes #267)


## [2.15] -- 2018-10-23

### Added
- Support for Mailgun regions (by Daniel Peukert, PR#280)
- CLI: Allow multiple occurrences of 'filter' when adding jobs (PR#278)

### Changed
- Fixed incorrect name for chat_id config in the default config (by Robin B, PR#276)


## [2.14] -- 2018-08-30

### Added
- Filter to pretty-print JSON data: `format-json` (by Niko Böckerman, PR#250)
- 
List active Telegram chats using `--telegram-chats` (with fixes by Georg Pichler, PR#270) 88 | - Support for HTTP `ETag` header in URL jobs and `If-None-Match` (by Karol Babioch, PR#256) 89 | - Support for filtering HTML using XPath expressions, with `lxml` (PR#274, Fixes #226) 90 | - Added `install_dependencies` to `setup.py` commands for easy installing of dependencies 91 | - Added `ignore_connection_errors` per-job configuration option (by Karol Babioch, PR#261) 92 | 93 | ### Changed 94 | - Improved code (HTTP status codes, by Karol Babioch PR#258) 95 | - Improved documentation for setting up Telegram chat bots 96 | - Allow multiple chats for Telegram reporting (by Georg Pichler, PR#271) 97 | 98 | 99 | ## [2.13] -- 2018-06-03 100 | 101 | ### Added 102 | - Support for specifying a `diff_tool` (e.g. `wdiff`) for each job (Fixes #243) 103 | - Support for testing filters via `--test-filter JOB` (Fixes #237) 104 | 105 | ### Changed 106 | - Moved ChangeLog file to CHANGELOG.md and using Keep a Changelog format. 
107 | - Force version check in `setup.py`, to exclude Python 2 (Fixes #244) 108 | - Remove default parameter from internal `html2text` module (Fixes #239) 109 | - Better error/exception reporting in `--verbose` mode (Fixes #164) 110 | 111 | ### Removed 112 | - Old ChangeLog entries 113 | 114 | 115 | ## [2.12] -- 2018-06-01 116 | 117 | ### Fixed 118 | - Bugfix: Do not 'forget' old data if an exception occurs (Fixes #242) 119 | 120 | 121 | ## [2.11] -- 2018-05-19 122 | 123 | ### Fixed 124 | - Retry: Make sure `tries` is initialized to zero on load (Fixes #241) 125 | 126 | ### Changed 127 | - html2text: Make sure the bs4 method strips HTML tags (by Louis Sautier) 128 | 129 | 130 | ## [2.10] -- 2018-05-17 131 | 132 | ### Added 133 | - Browser: Add support for browser jobs using `requests-html` (Fixes #215) 134 | - Retry: Add support for optional retry count in job list (by cmichi, fixes #235) 135 | - HTTP: Add support for specifying optional headers (by Tero Mononen) 136 | 137 | ### Changed 138 | - File editing: Fix issue when `$EDITOR` contains spaces (Fixes #220) 139 | - ChangeLog: Add versions to recent ChangeLog entries (Fixes #235) 140 | 141 | 142 | ## [2.9] -- 2018-03-24 143 | 144 | ### Added 145 | - E-Mail: Add support for `--smtp-login` and document GMail SMTP usage 146 | - Pushover: Device and sound attribute (by Tobias Haupenthal) 147 | 148 | ### Changed 149 | - XDG: Move cache file to `XDG_CACHE_DIR` (by Maxime Werlen) 150 | - Migration: Unconditionally migrate urlwatch 1.x cache dirs (Fixes #206) 151 | 152 | ### Fixed 153 | - Cleanups: Fix out-of-date debug message, use https (by Jakub Wilk) 154 | 155 | 156 | ## [2.8] -- 2018-01-28 157 | 158 | ### Changed 159 | - Documentation: Mention `appdirs` (by e-dschungel) 160 | 161 | ### Fixed 162 | - SMTP: Fix handling of missing `user` field (by e-dschungel) 163 | - Manpage: Fix documentation of XDG environment variables (by Jelle van der Waa) 164 | - Unit tests: Fix imports for out-of-source-tree tests (by Maxime 
Werlen) 165 | 166 | 167 | ## [2.7] -- 2017-11-08 168 | 169 | ### Added 170 | - Filtering: `style` (by gvandenbroucke), `tag` (by cmichi) 171 | - New reporter: Telegram support (by gvandenbroucke) 172 | - Paths: Add `XDG_CONFIG_DIR` support (by Jelle van der Waa) 173 | 174 | ### Changed 175 | - ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) 176 | - HTTP: Option to avoid 304 responses, `Content-Type` header (by Vinicius Massuchetto) 177 | - html2text: Configuration options (by Vinicius Massuchetto) 178 | 179 | ### Fixed 180 | - Issue #127: Fix error reporting 181 | - E-Mail: Fix encodings (by Seokjin Han), Allow `user` parameter for SMTP (by Jay Sitter) 182 | 183 | 184 | ## [2.6] -- 2016-12-04 185 | 186 | ### Added 187 | - New filters: `sha1sum`, `hexdump`, `element-by-class` 188 | - New reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) 189 | 190 | ### Changed 191 | - Improved filters: `BeautifulSoup` support for `html2txt` (by lechuckcaptain) 192 | - Improved handlers: HTTP Proxy (by lechuckcaptain); support for `file://` URIs 193 | - CI Integration: Build configuration for Travis CI (by lechuckcaptain) 194 | - Consistency: Feature list is now sorted by name 195 | 196 | ### Fixed 197 | - Issue #108: Fix creation of example files on first startup 198 | - Issue #118: Fix match filters for missing keys 199 | - Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier 200 | 201 | 202 | Older ChangeLog entries can be found in the 203 | [old ChangeLog file](https://github.com/thp/urlwatch/blob/2.12/ChangeLog), 204 | or with `git show 2.12:ChangeLog` on the command line. 205 | -------------------------------------------------------------------------------- /lib/urlwatch/jobs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 
4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import email.utils
import hashlib
import logging
import os
import re
import subprocess
import threading
import requests
import textwrap
import urlwatch
from requests.packages.urllib3.exceptions import InsecureRequestWarning

from .util import TrackSubClasses

# ssl_no_verify jobs would otherwise spam the log with warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

logger = logging.getLogger(__name__)


class ShellError(Exception):
    """Exception for shell commands with non-zero exit code"""

    def __init__(self, result):
        Exception.__init__(self)
        self.result = result  # the command's exit status

    def __str__(self):
        return '%s: Exit status %d' % (self.__class__.__name__, self.result)


class NotModifiedError(Exception):
    """Exception raised on HTTP 304 responses"""
    ...


class JobBase(object, metaclass=TrackSubClasses):
    # Registry of job kinds, populated by the TrackSubClasses metaclass.
    __subclasses__ = {}

    # Keys that must / may appear in a serialized job definition.
    __required__ = ()
    __optional__ = ()

    def __init__(self, **kwargs):
        # Set optional keys to None
        for k in self.__optional__:
            if k not in kwargs:
                setattr(self, k, None)

        # Fail if any required keys are not provided
        for k in self.__required__:
            if k not in kwargs:
                raise ValueError('Required field %s missing: %r' % (k, kwargs))

        for k, v in list(kwargs.items()):
            setattr(self, k, v)

    @classmethod
    def job_documentation(cls):
        """Build the human-readable listing of job kinds (used by --features)."""
        result = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            if sc.__doc__:
                result.append('  * %s - %s' % (sc.__kind__, sc.__doc__))
            else:
                result.append('  * %s' % (sc.__kind__,))

            for msg, value in (('    Required keys: ', sc.__required__), ('    Optional keys: ', sc.__optional__)):
                if value:
                    values = ('\n' + (len(msg) * ' ')).join(textwrap.wrap(', '.join(value), 79 - len(msg)))
                    result.append('%s%s' % (msg, values))
            result.append('')
        return '\n'.join(result)

    def get_location(self):
        """Return the job's identifying location (URL/command); abstract."""
        raise NotImplementedError()

    def pretty_name(self):
        """Return a human-readable name for the job; abstract."""
        raise NotImplementedError()

    def serialize(self):
        # Dict representation including the 'kind' discriminator.
        d = {'kind': self.__kind__}
        d.update(self.to_dict())
        return d

    @classmethod
    def unserialize(cls, data):
        if 'kind' not in data:
            # Try to auto-detect the kind of job based on the available keys
            kinds = [subclass.__kind__ for subclass in list(cls.__subclasses__.values())
                     if all(required in data for required in subclass.__required__) and not any(
                         key not in subclass.__required__ and key not in subclass.__optional__ for key in data)]

            if len(kinds) == 1:
                kind = kinds[0]
            elif len(kinds) == 0:
                raise ValueError('Kind is not specified, and no job matches: %r' % (data,))
            else:
                raise ValueError('Multiple kinds of jobs match %r: %r' % (data, kinds))
        else:
            kind = data['kind']

        return cls.__subclasses__[kind].from_dict(data)

    def to_dict(self):
        # Only keys that are set (non-None) are serialized.
        return {k: getattr(self, k) for keys in (self.__required__, self.__optional__) for k in keys
                if getattr(self, k) is not None}

    @classmethod
    def from_dict(cls, data):
        # Ignore keys that are neither required nor optional for this kind.
        return cls(**{k: v for k, v in list(data.items()) if k in cls.__required__ or k in cls.__optional__})

    def __repr__(self):
        return '<%s %s>' % (self.__kind__, ' '.join('%s=%r' % (k, v) for k, v in list(self.to_dict().items())))

    def _set_defaults(self, defaults):
        # Fill unset optional keys from a defaults mapping (see job_defaults).
        if isinstance(defaults, dict):
            for key, value in defaults.items():
                if key in self.__optional__ and getattr(self, key) is None:
                    setattr(self, key, value)

    def with_defaults(self, config):
        # Return a copy of this job with per-kind, then global, defaults applied.
        new_job = JobBase.unserialize(self.serialize())
        cfg = config.get('job_defaults')
        if isinstance(cfg, dict):
            new_job._set_defaults(cfg.get(self.__kind__))
            new_job._set_defaults(cfg.get('all'))
        return new_job

    def get_guid(self):
        # Stable cache identifier: SHA-1 of the job's location.
        location = self.get_location()
        sha_hash = hashlib.new('sha1')
        sha_hash.update(location.encode('utf-8'))
        return sha_hash.hexdigest()

    def request_resources(self, resources):
        """Request external resources.

        Check if required resources is available in `resources` (a dict).
        If not, request resources and save them in `resources`. Keys for
        resources should typically be `self` or class, depending on the
        shared or exclusive nature of the resource.

        This method should be called sequentially on the main thread to
        ensure success.

        Args:
            resources: A dict storing external resources.
        """
        ...

    def release_resources(self, resources):
        """Release external resources requested in `request_resources`."""
        ...

    def retrieve(self, job_state):
        """Fetch and return the job's current data; abstract."""
        raise NotImplementedError()

    def format_error(self, exception, tb):
        # Default: report the full traceback; subclasses may shorten it.
        return tb

    def ignore_error(self, exception):
        # Default: never ignore errors; subclasses add per-job ignore options.
        return False


class Job(JobBase):
    # Common base for concrete jobs: optional keys shared by all kinds.
    __required__ = ()
    __optional__ = ('name', 'filter', 'max_tries', 'diff_tool', 'compared_versions')

    # determine if hyperlink "a" tag is used in HtmlReporter
    LOCATION_IS_URL = False

    def pretty_name(self):
        return self.name if self.name else self.get_location()


class ShellJob(Job):
    """Run a shell command and get its standard output"""

    __kind__ = 'shell'

    __required__ = ('command',)
    __optional__ = ()

    def get_location(self):
        return self.command

    def retrieve(self, job_state):
        # NOTE: shell=True deliberately runs the user-configured command line
        # through the shell; the job list is trusted input.
        process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True)
        stdout_data, stderr_data = process.communicate()
        result = process.wait()
        if result != 0:
            raise ShellError(result)

        return 
stdout_data.decode('utf-8')


class UrlJob(Job):
    """Retrieve an URL from a web server"""

    __kind__ = 'url'

    __required__ = ('url',)
    __optional__ = ('cookies', 'data', 'method', 'ssl_no_verify', 'ignore_cached', 'http_proxy', 'https_proxy',
                    'headers', 'ignore_connection_errors', 'ignore_http_error_codes', 'encoding', 'timeout',
                    'ignore_timeout_errors', 'ignore_too_many_redirects')

    LOCATION_IS_URL = True
    # Matches "text/html; charset=..." / "text/plain; charset=..." values.
    CHARSET_RE = re.compile('text/(html|plain); charset=([^;]*)')

    def get_location(self):
        return self.url

    def retrieve(self, job_state):
        headers = {
            'User-agent': urlwatch.__user_agent__,
        }

        proxies = {
            'http': os.getenv('HTTP_PROXY'),
            'https': os.getenv('HTTPS_PROXY'),
        }

        if job_state.etag is not None:
            # Conditional GET: reuse the ETag from the previous response
            headers['If-None-Match'] = job_state.etag

        if job_state.timestamp is not None:
            headers['If-Modified-Since'] = email.utils.formatdate(job_state.timestamp)

        if self.ignore_cached or job_state.tries > 0:
            # Force a fresh (non-304) response, e.g. while retrying after errors
            headers['If-None-Match'] = None
            headers['If-Modified-Since'] = email.utils.formatdate(0)
            headers['Cache-Control'] = 'max-age=172800'
            headers['Expires'] = email.utils.formatdate()

        if self.method is None:
            self.method = "GET"
        if self.data is not None:
            # Presence of form data implies a POST request
            self.method = "POST"
            headers['Content-type'] = 'application/x-www-form-urlencoded'
            logger.info('Sending POST request to %s', self.url)

        if self.http_proxy is not None:
            proxies['http'] = self.http_proxy
        if self.https_proxy is not None:
            proxies['https'] = self.https_proxy

        file_scheme = 'file://'
        if self.url.startswith(file_scheme):
            logger.info('Using local filesystem (%s URI scheme)', file_scheme)
            return open(self.url[len(file_scheme):], 'rt').read()

        if self.headers:
            self.add_custom_headers(headers)

        if self.timeout is None:
            # default timeout
            timeout = 60
        elif self.timeout == 0:
            # never timeout
            timeout = None
        else:
            timeout = self.timeout

        response = requests.request(url=self.url,
                                    data=self.data,
                                    headers=headers,
                                    method=self.method,
                                    verify=(not self.ssl_no_verify),
                                    cookies=self.cookies,
                                    proxies=proxies,
                                    timeout=timeout)

        response.raise_for_status()
        if response.status_code == requests.codes.not_modified:
            raise NotModifiedError()

        # Save ETag from response into job_state, which will be saved in cache
        job_state.etag = response.headers.get('ETag')

        # If we're doing OCR, return the request content directly
        if self.filter is not None and 'ocr' in self.filter:
            return response.content

        # If we can't find the encoding in the headers, requests gets all
        # old-RFC-y and assumes ISO-8859-1 instead of UTF-8. Use the old
        # urlwatch behavior and try UTF-8 decoding first.
        content_type = response.headers.get('Content-type', '')
        content_type_match = self.CHARSET_RE.match(content_type)
        if not content_type_match and not self.encoding:
            # Fallback chain: UTF-8 -> Latin-1 -> UTF-8 (ignoring errors)
            try:
                try:
                    try:
                        return response.content.decode('utf-8')
                    except UnicodeDecodeError:
                        return response.content.decode('latin1')
                except UnicodeDecodeError:
                    return response.content.decode('utf-8', 'ignore')
            except LookupError:
                # If this is an invalid encoding, decode as ascii (Debian bug 731931)
                return response.content.decode('ascii', 'ignore')
        if self.encoding:
            response.encoding = self.encoding

        return response.text

    def add_custom_headers(self, headers):
        """
        Adds custom request headers from the job list (URLs) to the pre-filled dictionary `headers`.
        Pre-filled values of conflicting header keys (case-insensitive) are overwritten by custom value.
        """
        headers_to_remove = [x for x in headers if x.lower() in [y.lower() for y in self.headers]]
        for header in headers_to_remove:
            headers.pop(header, None)
        headers.update(self.headers)

    def format_error(self, exception, tb):
        if isinstance(exception, requests.exceptions.RequestException):
            # Instead of a full traceback, just show the HTTP error
            return str(exception)
        return tb

    def ignore_error(self, exception):
        # Honour the per-job ignore_* options for common request failures.
        if isinstance(exception, requests.exceptions.ConnectionError) and self.ignore_connection_errors:
            return True
        if isinstance(exception, requests.exceptions.Timeout) and self.ignore_timeout_errors:
            return True
        if isinstance(exception, requests.exceptions.TooManyRedirects) and self.ignore_too_many_redirects:
            return True
        elif isinstance(exception, requests.exceptions.HTTPError):
            # ignore_http_error_codes may be a single int, a comma-separated
            # string or a list; "5xx"-style entries match a whole status class.
            status_code = exception.response.status_code
            ignored_codes = []
            if isinstance(self.ignore_http_error_codes, int) and self.ignore_http_error_codes == status_code:
                return True
            elif isinstance(self.ignore_http_error_codes, str):
                ignored_codes = [s.strip().lower() for s in self.ignore_http_error_codes.split(',')]
            elif isinstance(self.ignore_http_error_codes, list):
                ignored_codes = [str(s).strip().lower() for s in self.ignore_http_error_codes]
            return str(status_code) in ignored_codes or '%sxx' % (status_code // 100) in ignored_codes
        return False


class BrowserJob(Job):
    """Retrieve an URL, emulating a real web browser"""

    __kind__ = 'browser'

    __required__ = ('navigate',)
    __optional__ = ('options',)

    LOCATION_IS_URL = True

    def get_location(self):
        return self.navigate

    def request_resources(self, resources):
        # Lazily start one shared pyppeteer browser plus its event-loop thread.
        import asyncio
import pyppeteer 386 | 387 | @asyncio.coroutine 388 | def _launch_browser(): 389 | browser = yield from pyppeteer.launch() 390 | for p in (yield from browser.pages()): 391 | yield from p.close() 392 | return browser 393 | 394 | if BrowserJob not in resources: 395 | event_loop = asyncio.new_event_loop() 396 | browser = event_loop.run_until_complete(_launch_browser()) 397 | loop_thread = threading.Thread(target=event_loop.run_forever) 398 | loop_thread.start() 399 | resources[BrowserJob] = { 400 | 'event_loop': event_loop, 401 | 'browser': browser, 402 | 'loop_thread': loop_thread 403 | } 404 | 405 | def release_resources(self, resources): 406 | import asyncio 407 | res = resources.get(BrowserJob) 408 | if res is not None: 409 | event_loop = res['event_loop'] 410 | browser = res['browser'] 411 | loop_thread = res['loop_thread'] 412 | event_loop.call_soon_threadsafe(event_loop.stop) 413 | loop_thread.join() 414 | event_loop.run_until_complete(browser.close()) 415 | del resources[BrowserJob] 416 | 417 | def retrieve(self, job_state): 418 | import asyncio 419 | 420 | @asyncio.coroutine 421 | def _get_content(browser): 422 | context = yield from browser.createIncognitoBrowserContext() 423 | page = yield from context.newPage() 424 | yield from page.goto(self.navigate, options=self.options) 425 | content = yield from page.content() 426 | yield from context.close() 427 | return content 428 | 429 | event_loop = job_state.resources[BrowserJob]['event_loop'] 430 | browser = job_state.resources[BrowserJob]['browser'] 431 | return asyncio.run_coroutine_threadsafe(_get_content(browser), event_loop).result() 432 | -------------------------------------------------------------------------------- /lib/urlwatch/command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | 31 | import imp 32 | import logging 33 | import os 34 | import shutil 35 | import sys 36 | import requests 37 | import email.utils 38 | 39 | from .filters import FilterBase 40 | from .handler import JobState 41 | from .jobs import JobBase, UrlJob 42 | from .reporters import ReporterBase 43 | from .util import atomic_rename, edit_file 44 | from .mailer import set_password, have_password 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | 49 | class UrlwatchCommand: 50 | def __init__(self, urlwatcher): 51 | 52 | self.urlwatcher = urlwatcher 53 | self.urlwatch_config = urlwatcher.urlwatch_config 54 | 55 | def edit_hooks(self): 56 | fn_base, fn_ext = os.path.splitext(self.urlwatch_config.hooks) 57 | hooks_edit = fn_base + '.edit' + fn_ext 58 | try: 59 | if os.path.exists(self.urlwatch_config.hooks): 60 | shutil.copy(self.urlwatch_config.hooks, hooks_edit) 61 | elif self.urlwatch_config.hooks_py_example is not None and os.path.exists( 62 | self.urlwatch_config.hooks_py_example): 63 | shutil.copy(self.urlwatch_config.hooks_py_example, hooks_edit) 64 | edit_file(hooks_edit) 65 | imp.load_source('hooks', hooks_edit) 66 | atomic_rename(hooks_edit, self.urlwatch_config.hooks) 67 | print('Saving edit changes in', self.urlwatch_config.hooks) 68 | except SystemExit: 69 | raise 70 | except Exception as e: 71 | print('Parsing failed:') 72 | print('======') 73 | print(e) 74 | print('======') 75 | print('') 76 | print('The file', self.urlwatch_config.hooks, 'was NOT updated.') 77 | print('Your changes have been saved in', hooks_edit) 78 | return 1 79 | 80 | return 0 81 | 82 | def show_features(self): 83 | print() 84 | print('Supported jobs:\n') 85 | print(JobBase.job_documentation()) 86 | 87 | print('Supported filters:\n') 88 | print(FilterBase.filter_documentation()) 89 | print() 90 | print('Supported reporters:\n') 91 | print(ReporterBase.reporter_documentation()) 92 | print() 93 | return 0 94 | 95 | def list_urls(self): 96 | for idx, job in 
enumerate(self.urlwatcher.jobs): 97 | if self.urlwatch_config.verbose: 98 | print('%d: %s' % (idx + 1, repr(job))) 99 | else: 100 | pretty_name = job.pretty_name() 101 | location = job.get_location() 102 | if pretty_name != location: 103 | print('%d: %s ( %s )' % (idx + 1, pretty_name, location)) 104 | else: 105 | print('%d: %s' % (idx + 1, pretty_name)) 106 | return 0 107 | 108 | def _find_job(self, query): 109 | try: 110 | index = int(query) 111 | if index <= 0: 112 | return None 113 | try: 114 | return self.urlwatcher.jobs[index - 1] 115 | except IndexError: 116 | return None 117 | except ValueError: 118 | return next((job for job in self.urlwatcher.jobs if job.get_location() == query), None) 119 | 120 | def test_filter(self): 121 | job = self._find_job(self.urlwatch_config.test_filter) 122 | if job is None: 123 | print('Not found: %r' % (self.urlwatch_config.test_filter,)) 124 | return 1 125 | job = job.with_defaults(self.urlwatcher.config_storage.config) 126 | 127 | if isinstance(job, UrlJob): 128 | # Force re-retrieval of job, as we're testing filters 129 | job.ignore_cached = True 130 | 131 | resources = {} 132 | job.request_resources(resources) 133 | job_state = JobState(self.urlwatcher.cache_storage, resources, job) 134 | job_state.process() 135 | if job_state.exception is not None: 136 | raise job_state.exception 137 | print(job_state.new_data) 138 | # We do not save the job state or job on purpose here, since we are possibly modifying the job 139 | # (ignore_cached) and we do not want to store the newly-retrieved data yet (filter testing) 140 | job.release_resources(resources) 141 | return 0 142 | 143 | def modify_urls(self): 144 | save = True 145 | if self.urlwatch_config.delete is not None: 146 | job = self._find_job(self.urlwatch_config.delete) 147 | if job is not None: 148 | self.urlwatcher.jobs.remove(job) 149 | print('Removed %r' % (job,)) 150 | else: 151 | print('Not found: %r' % (self.urlwatch_config.delete,)) 152 | save = False 153 | 154 | if 
self.urlwatch_config.add is not None: 155 | # Allow multiple specifications of filter=, so that multiple filters can be specified on the CLI 156 | items = [item.split('=', 1) for item in self.urlwatch_config.add.split(',')] 157 | filters = [v for k, v in items if k == 'filter'] 158 | items = [(k, v) for k, v in items if k != 'filter'] 159 | d = {k: v for k, v in items} 160 | if filters: 161 | d['filter'] = ','.join(filters) 162 | 163 | job = JobBase.unserialize(d) 164 | print('Adding %r' % (job,)) 165 | self.urlwatcher.jobs.append(job) 166 | 167 | if save: 168 | self.urlwatcher.urls_storage.save(self.urlwatcher.jobs) 169 | 170 | return 0 171 | 172 | def handle_actions(self): 173 | if self.urlwatch_config.features: 174 | sys.exit(self.show_features()) 175 | if self.urlwatch_config.gc_cache: 176 | self.urlwatcher.cache_storage.gc([job.get_guid() for job in self.urlwatcher.jobs]) 177 | sys.exit(0) 178 | if self.urlwatch_config.edit: 179 | sys.exit(self.urlwatcher.urls_storage.edit(self.urlwatch_config.urls_yaml_example)) 180 | if self.urlwatch_config.edit_hooks: 181 | sys.exit(self.edit_hooks()) 182 | if self.urlwatch_config.test_filter: 183 | sys.exit(self.test_filter()) 184 | if self.urlwatch_config.list: 185 | sys.exit(self.list_urls()) 186 | if self.urlwatch_config.report_timestamps: 187 | sys.exit(self.report_timestamps()) 188 | if self.urlwatch_config.add is not None or self.urlwatch_config.delete is not None: 189 | sys.exit(self.modify_urls()) 190 | 191 | def check_edit_config(self): 192 | if self.urlwatch_config.edit_config: 193 | sys.exit(self.urlwatcher.config_storage.edit()) 194 | 195 | def check_telegram_chats(self): 196 | if self.urlwatch_config.telegram_chats: 197 | config = self.urlwatcher.config_storage.config['report'].get('telegram', None) 198 | if not config: 199 | print('You need to configure telegram in your config first (see README.md)') 200 | sys.exit(1) 201 | 202 | bot_token = config.get('bot_token', None) 203 | if not bot_token: 204 | 
print('You need to set up your bot token first (see README.md)') 205 | sys.exit(1) 206 | 207 | info = requests.get('https://api.telegram.org/bot{}/getMe'.format(bot_token)).json() 208 | 209 | chats = {} 210 | for chat_info in requests.get('https://api.telegram.org/bot{}/getUpdates'.format(bot_token)).json()['result']: 211 | chat = chat_info['message']['chat'] 212 | if chat['type'] == 'private': 213 | chats[str(chat['id'])] = ' '.join((chat['first_name'], chat['last_name'])) if 'last_name' in chat else chat['first_name'] 214 | 215 | if not chats: 216 | print('No chats found. Say hello to your bot at https://t.me/{}'.format(info['result']['username'])) 217 | sys.exit(1) 218 | 219 | headers = ('Chat ID', 'Name') 220 | maxchat = max(len(headers[0]), max((len(k) for k, v in chats.items()), default=0)) 221 | maxname = max(len(headers[1]), max((len(v) for k, v in chats.items()), default=0)) 222 | fmt = '%-' + str(maxchat) + 's %s' 223 | print(fmt % headers) 224 | print(fmt % ('-' * maxchat, '-' * maxname)) 225 | for k, v in sorted(chats.items(), key=lambda kv: kv[1]): 226 | print(fmt % (k, v)) 227 | print('\nChat up your bot here: https://t.me/{}'.format(info['result']['username'])) 228 | sys.exit(0) 229 | 230 | def check_test_slack(self): 231 | if self.urlwatch_config.test_slack: 232 | config = self.urlwatcher.config_storage.config['report'].get('slack', None) 233 | if not config: 234 | print('You need to configure slack in your config first (see README.md)') 235 | sys.exit(1) 236 | 237 | webhook_url = config.get('webhook_url', None) 238 | if not webhook_url: 239 | print('You need to set up your slack webhook_url first (see README.md)') 240 | sys.exit(1) 241 | 242 | info = requests.post(webhook_url, json={"text": "Test message from urlwatch, your configuration is working"}) 243 | if info.status_code == requests.codes.ok: 244 | print('Successfully sent message to Slack') 245 | sys.exit(0) 246 | else: 247 | print('Error while submitting message to 
Slack:{0}'.format(info.text)) 248 | sys.exit(1) 249 | 250 | def check_smtp_login(self): 251 | if self.urlwatch_config.smtp_login: 252 | config = self.urlwatcher.config_storage.config['report']['email'] 253 | smtp_config = config['smtp'] 254 | 255 | success = True 256 | 257 | if not config['enabled']: 258 | print('Please enable e-mail reporting in the config first.') 259 | success = False 260 | 261 | if config['method'] != 'smtp': 262 | print('Please set the method to SMTP for the e-mail reporter.') 263 | success = False 264 | 265 | if not smtp_config.get('auth', smtp_config.get('keyring', False)): 266 | print('Authentication must be enabled for SMTP.') 267 | success = False 268 | 269 | smtp_hostname = smtp_config['host'] 270 | if not smtp_hostname: 271 | print('Please configure the SMTP hostname in the config first.') 272 | success = False 273 | 274 | smtp_username = smtp_config.get('user', None) or config['from'] 275 | if not smtp_username: 276 | print('Please configure the SMTP user in the config first.') 277 | success = False 278 | 279 | if not success: 280 | sys.exit(1) 281 | 282 | if 'insecure_password' in smtp_config: 283 | print('The password is already set in the config (key "insecure_password").') 284 | sys.exit(0) 285 | 286 | if have_password(smtp_hostname, smtp_username): 287 | message = 'Password for %s / %s already set, update? [y/N] ' % (smtp_username, smtp_hostname) 288 | if input(message).lower() != 'y': 289 | print('Password unchanged.') 290 | sys.exit(0) 291 | 292 | if success: 293 | set_password(smtp_hostname, smtp_username) 294 | # TODO: Actually verify that the login to the server works 295 | 296 | sys.exit(0) 297 | 298 | # for COVID19tracker, produce a sorted report of each entry in the storage database along with its 299 | # last-updated timestamp and push it to a webhook. 
running_after_jobs is false when invoked with 300 | # --report-timestamps, and true if invoked automatically after regular jobs are run 301 | def report_timestamps(self, running_after_jobs = False): 302 | states = {} 303 | for idx, job in enumerate(self.urlwatcher.jobs): 304 | pretty_name = job.pretty_name() 305 | guid = job.get_guid() 306 | job_data = self.urlwatcher.cache_storage.load(None, guid) 307 | job_data_content = job_data[0] 308 | timestamp = job_data[1] 309 | states[pretty_name] = {'content': '', 'timestamp': timestamp, 310 | 'error': ("error" in job_data_content.lower())} 311 | 312 | # define strings to use to determine error messaging 313 | css_xpath_errors = ['cssfilter', 'xpathfilter'] 314 | loading_errors = ['read timed out', 'connection reset', 'timed out', 'server error', '503', '500', '522', '424', '401', 'timeouterror', 'failed to establish a new connection', 'max retries exceeded'] 315 | not_found_errors = ['not found for url', '404'] 316 | 317 | error_content = '' 318 | # check to see if an error was picked up 319 | if states[pretty_name]['error']: 320 | # check for css_xpath_errors strings 321 | if any(substring in job_data_content.lower() for substring in css_xpath_errors): 322 | error_content = 'Current filter rules are not working.' 323 | 324 | # check for various loading errors 325 | if any(substring in job_data_content.lower() for substring in loading_errors): 326 | error_content = 'Error loading URL. May be down.' 327 | 328 | # check for 404 specific 329 | if any(substring in job_data_content.lower() for substring in not_found_errors): 330 | error_content = 'URL not found anymore.' 331 | 332 | # did not match any checks 333 | if not error_content: 334 | error_content = 'Unknown error.' 
335 | 336 | # update content 337 | states[pretty_name].update({'content': error_content}) 338 | 339 | states = sorted(states.items(), key=lambda x: x[1]['timestamp'], reverse=True) 340 | output = "" 341 | for state in states: 342 | timestamp_formatted = email.utils.formatdate(state[1]['timestamp'], localtime=1) 343 | output += ("%s|%s|%s\n" % (state[0], state[1]['timestamp'], state[1]['content'])) 344 | 345 | if not running_after_jobs: 346 | print(output) 347 | 348 | config = self.urlwatcher.config_storage.config['report'].get('timestamp_webhook', None) 349 | if not config: 350 | print('You need to configure timestamp_webhook in your config first') 351 | sys.exit(1) 352 | if not config['enabled']: 353 | print('Timestamp reporting not enabled') 354 | 355 | webhook_url = config.get('webhook_url', None) 356 | post_data = {'value1': output} 357 | result = requests.post(webhook_url, json=post_data) 358 | 359 | try: 360 | if result.status_code == requests.codes.ok: 361 | logger.info("Webhook response: ok") 362 | else: 363 | logger.error("Webhook error: {0}".format(result.text)) 364 | except ValueError: 365 | logger.error( 366 | "Failed to parse webook response. 
HTTP status code: {0}, content: {1}".format(result.status_code, 367 | result.content)) 368 | if not running_after_jobs: 369 | sys.exit(0) 370 | 371 | def run(self): 372 | self.check_edit_config() 373 | self.check_smtp_login() 374 | self.check_telegram_chats() 375 | self.check_test_slack() 376 | self.handle_actions() 377 | self.urlwatcher.run_jobs() 378 | 379 | # if timestamp reporting is enabled, 380 | config = self.urlwatcher.config_storage.config['report'].get('timestamp_webhook', None) 381 | if config['enabled']: 382 | self.report_timestamps(running_after_jobs = True); 383 | 384 | self.urlwatcher.close() 385 | -------------------------------------------------------------------------------- /lib/urlwatch/storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | 31 | import os 32 | import stat 33 | import copy 34 | import platform 35 | from abc import ABCMeta, abstractmethod 36 | 37 | import shutil 38 | import yaml 39 | import minidb 40 | import logging 41 | 42 | from .util import atomic_rename, edit_file 43 | from .jobs import JobBase, UrlJob, ShellJob 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | DEFAULT_CONFIG = { 48 | 'display': { 49 | 'new': True, 50 | 'error': True, 51 | 'unchanged': False, 52 | }, 53 | 54 | 'report': { 55 | 'text': { 56 | 'line_length': 75, 57 | 'details': True, 58 | 'footer': True, 59 | 'minimal': False, 60 | }, 61 | 62 | 'html': { 63 | 'diff': 'unified', # "unified" or "table" 64 | }, 65 | 66 | 'stdout': { 67 | 'enabled': True, 68 | 'color': True, 69 | }, 70 | 71 | 'email': { 72 | 'enabled': False, 73 | 74 | 'html': False, 75 | 'to': '', 76 | 'from': '', 77 | 'subject': '{count} changes: {jobs}', 78 | 'method': 'smtp', 79 | 'smtp': { 80 | 'host': 'localhost', 81 | 'user': '', 82 | 'port': 25, 83 | 'starttls': True, 84 | 'auth': True, 85 | }, 86 | 'sendmail': { 87 | 'path': 'sendmail', 88 | } 89 | }, 90 | 'pushover': { 91 | 'enabled': False, 92 | 'app': '', 93 | 'device': None, 94 | 'sound': 'spacealarm', 95 | 'user': '', 96 | }, 97 | 'pushbullet': { 98 | 'enabled': False, 99 | 'api_key': '', 100 | }, 101 | 'telegram': { 102 | 'enabled': False, 103 | 'bot_token': '', 104 | 'chat_id': '', 105 | }, 106 | 'slack': { 107 | 'enabled': False, 108 | 
'webhook_url': '', 109 | }, 110 | 'webhook': { 111 | 'enabled': False, 112 | 'webhook_url': '', 113 | }, 114 | 'timestamp_webhook': { 115 | 'enabled': False, 116 | 'webhook_url': '', 117 | }, 118 | 'mailgun': { 119 | 'enabled': False, 120 | 'region': 'us', 121 | 'api_key': '', 122 | 'domain': '', 123 | 'from_mail': '', 124 | 'from_name': '', 125 | 'to': '', 126 | 'subject': '{count} changes: {jobs}' 127 | }, 128 | }, 129 | 130 | 'job_defaults': { 131 | 'all': {}, 132 | 'shell': {}, 133 | 'url': {}, 134 | 'browser': {} 135 | } 136 | } 137 | 138 | 139 | def merge(source, destination): 140 | # http://stackoverflow.com/a/20666342 141 | for key, value in source.items(): 142 | if isinstance(value, dict): 143 | # get node or create one 144 | node = destination.setdefault(key, {}) 145 | merge(value, node) 146 | else: 147 | destination[key] = value 148 | 149 | return destination 150 | 151 | 152 | def get_current_user(): 153 | try: 154 | return os.getlogin() 155 | except OSError: 156 | # If there is no controlling terminal, because urlwatch is launched by 157 | # cron, or by a systemd.service for example, os.getlogin() fails with: 158 | # OSError: [Errno 25] Inappropriate ioctl for device 159 | import pwd 160 | return pwd.getpwuid(os.getuid()).pw_name 161 | 162 | 163 | class BaseStorage(metaclass=ABCMeta): 164 | @abstractmethod 165 | def load(self, *args): 166 | ... 167 | 168 | @abstractmethod 169 | def save(self, *args): 170 | ... 171 | 172 | 173 | class BaseFileStorage(BaseStorage, metaclass=ABCMeta): 174 | def __init__(self, filename): 175 | self.filename = filename 176 | 177 | 178 | class BaseTextualFileStorage(BaseFileStorage, metaclass=ABCMeta): 179 | def __init__(self, filename): 180 | super().__init__(filename) 181 | self.config = {} 182 | self.load() 183 | 184 | @classmethod 185 | @abstractmethod 186 | def parse(cls, *args): 187 | ... 
188 | 189 | def edit(self, example_file=None): 190 | fn_base, fn_ext = os.path.splitext(self.filename) 191 | file_edit = fn_base + '.edit' + fn_ext 192 | 193 | if os.path.exists(self.filename): 194 | shutil.copy(self.filename, file_edit) 195 | elif example_file is not None and os.path.exists(example_file): 196 | shutil.copy(example_file, file_edit) 197 | 198 | while True: 199 | try: 200 | edit_file(file_edit) 201 | # Check if we can still parse it 202 | if self.parse is not None: 203 | self.parse(file_edit) 204 | break # stop if no exception on parser 205 | except SystemExit: 206 | raise 207 | except Exception as e: 208 | print('Parsing failed:') 209 | print('======') 210 | print(e) 211 | print('======') 212 | print('') 213 | print('The file', file_edit, 'was NOT updated.') 214 | user_input = input("Do you want to retry the same edit? (y/n)") 215 | if user_input.lower()[0] == 'y': 216 | continue 217 | print('Your changes have been saved in', file_edit) 218 | return 1 219 | 220 | atomic_rename(file_edit, self.filename) 221 | print('Saving edit changes in', self.filename) 222 | return 0 223 | 224 | @classmethod 225 | def write_default_config(cls, filename): 226 | config_storage = cls(None) 227 | config_storage.filename = filename 228 | config_storage.save() 229 | 230 | 231 | class UrlsBaseFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 232 | def __init__(self, filename): 233 | self.filename = filename 234 | 235 | def shelljob_security_checks(self): 236 | 237 | if platform.system() == 'Windows': 238 | return [] 239 | 240 | shelljob_errors = [] 241 | current_uid = os.getuid() 242 | 243 | dirname = os.path.dirname(self.filename) or '.' 
244 | dir_st = os.stat(dirname) 245 | if (dir_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: 246 | shelljob_errors.append('%s is group/world-writable' % dirname) 247 | if dir_st.st_uid != current_uid: 248 | shelljob_errors.append('%s not owned by %s' % (dirname, get_current_user())) 249 | 250 | file_st = os.stat(self.filename) 251 | if (file_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: 252 | shelljob_errors.append('%s is group/world-writable' % self.filename) 253 | if file_st.st_uid != current_uid: 254 | shelljob_errors.append('%s not owned by %s' % (self.filename, get_current_user())) 255 | 256 | return shelljob_errors 257 | 258 | def load_secure(self): 259 | jobs = self.load() 260 | 261 | # Security checks for shell jobs - only execute if the current UID 262 | # is the same as the file/directory owner and only owner can write 263 | shelljob_errors = self.shelljob_security_checks() 264 | if shelljob_errors and any(isinstance(job, ShellJob) for job in jobs): 265 | print(('Removing shell jobs, because %s' % (' and '.join(shelljob_errors),))) 266 | jobs = [job for job in jobs if not isinstance(job, ShellJob)] 267 | 268 | return jobs 269 | 270 | 271 | class BaseTxtFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 272 | @classmethod 273 | def parse(cls, *args): 274 | filename = args[0] 275 | if filename is not None and os.path.exists(filename): 276 | with open(filename) as fp: 277 | for line in fp: 278 | line = line.strip() 279 | if not line or line.startswith('#'): 280 | continue 281 | 282 | if line.startswith('|'): 283 | yield ShellJob(command=line[1:]) 284 | else: 285 | args = line.split(None, 2) 286 | if len(args) == 1: 287 | yield UrlJob(url=args[0]) 288 | elif len(args) == 2: 289 | yield UrlJob(url=args[0], post=args[1]) 290 | else: 291 | raise ValueError('Unsupported line format: %r' % (line,)) 292 | 293 | 294 | class BaseYamlFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): 295 | @classmethod 296 | def parse(cls, *args): 297 | filename = 
args[0] 298 | if filename is not None and os.path.exists(filename): 299 | with open(filename) as fp: 300 | return yaml.load(fp, Loader=yaml.SafeLoader) 301 | 302 | 303 | class YamlConfigStorage(BaseYamlFileStorage): 304 | def load(self, *args): 305 | self.config = merge(self.parse(self.filename) or {}, copy.deepcopy(DEFAULT_CONFIG)) 306 | 307 | def save(self, *args): 308 | with open(self.filename, 'w') as fp: 309 | yaml.dump(self.config, fp, default_flow_style=False) 310 | 311 | 312 | class UrlsYaml(BaseYamlFileStorage, UrlsBaseFileStorage): 313 | 314 | @classmethod 315 | def parse(cls, *args): 316 | filename = args[0] 317 | if filename is not None and os.path.exists(filename): 318 | with open(filename) as fp: 319 | return [JobBase.unserialize(job) for job in yaml.load_all(fp, Loader=yaml.SafeLoader) if job is not None] 320 | 321 | def save(self, *args): 322 | jobs = args[0] 323 | print('Saving updated list to %r' % self.filename) 324 | 325 | with open(self.filename, 'w') as fp: 326 | yaml.dump_all([job.serialize() for job in jobs], fp, default_flow_style=False) 327 | 328 | def load(self, *args): 329 | with open(self.filename) as fp: 330 | return [JobBase.unserialize(job) for job in yaml.load_all(fp, Loader=yaml.SafeLoader) if job is not None] 331 | 332 | 333 | class UrlsTxt(BaseTxtFileStorage, UrlsBaseFileStorage): 334 | def load(self): 335 | return list(self.parse(self.filename)) 336 | 337 | def save(self, jobs): 338 | print(jobs) 339 | raise NotImplementedError() 340 | 341 | 342 | class CacheStorage(BaseFileStorage, metaclass=ABCMeta): 343 | @abstractmethod 344 | def close(self): 345 | ... 346 | 347 | @abstractmethod 348 | def get_guids(self): 349 | ... 350 | 351 | @abstractmethod 352 | def load(self, job, guid): 353 | ... 354 | 355 | @abstractmethod 356 | def save(self, job, guid, data, timestamp, tries, etag=None): 357 | ... 358 | 359 | @abstractmethod 360 | def delete(self, guid): 361 | ... 362 | 363 | @abstractmethod 364 | def clean(self, guid): 365 | ... 
366 | 367 | def backup(self): 368 | for guid in self.get_guids(): 369 | data, timestamp, tries, etag = self.load(None, guid) 370 | yield guid, data, timestamp, tries, etag 371 | 372 | def restore(self, entries): 373 | for guid, data, timestamp, tries, etag in entries: 374 | self.save(None, guid, data, timestamp, tries, etag) 375 | 376 | def gc(self, known_guids): 377 | for guid in set(self.get_guids()) - set(known_guids): 378 | print('Removing: {guid}'.format(guid=guid)) 379 | self.delete(guid) 380 | 381 | for guid in known_guids: 382 | count = self.clean(guid) 383 | if count > 0: 384 | print('Removed {count} old versions of {guid}'.format(count=count, guid=guid)) 385 | 386 | 387 | class CacheDirStorage(CacheStorage): 388 | def __init__(self, filename): 389 | super().__init__(filename) 390 | if not os.path.exists(filename): 391 | os.makedirs(filename) 392 | 393 | def close(self): 394 | # No need to close 395 | return 0 396 | 397 | def _get_filename(self, guid): 398 | return os.path.join(self.filename, guid) 399 | 400 | def get_guids(self): 401 | return os.listdir(self.filename) 402 | 403 | def load(self, job, guid): 404 | filename = self._get_filename(guid) 405 | if not os.path.exists(filename): 406 | return None, None, None, None 407 | 408 | try: 409 | with open(filename) as fp: 410 | data = fp.read() 411 | except UnicodeDecodeError: 412 | with open(filename, 'rb') as fp: 413 | data = fp.read().decode('utf-8', 'ignore') 414 | 415 | timestamp = os.stat(filename)[stat.ST_MTIME] 416 | 417 | return data, timestamp, None, None 418 | 419 | def save(self, job, guid, data, timestamp, etag=None): 420 | # Timestamp and ETag are always ignored 421 | filename = self._get_filename(guid) 422 | with open(filename, 'w+') as fp: 423 | fp.write(data) 424 | 425 | def delete(self, guid): 426 | filename = self._get_filename(guid) 427 | if os.path.exists(filename): 428 | os.unlink(filename) 429 | 430 | def clean(self, guid): 431 | # We only store the latest version, no need to clean 
432 | return 0 433 | 434 | 435 | class CacheEntry(minidb.Model): 436 | guid = str 437 | timestamp = int 438 | data = str 439 | tries = int 440 | etag = str 441 | 442 | 443 | class CacheMiniDBStorage(CacheStorage): 444 | def __init__(self, filename): 445 | super().__init__(filename) 446 | 447 | dirname = os.path.dirname(filename) 448 | if dirname and not os.path.isdir(dirname): 449 | os.makedirs(dirname) 450 | 451 | self.db = minidb.Store(self.filename, debug=True) 452 | self.db.register(CacheEntry) 453 | 454 | def close(self): 455 | self.db.close() 456 | self.db = None 457 | 458 | def get_guids(self): 459 | return (guid for guid, in CacheEntry.query(self.db, minidb.Function('distinct', CacheEntry.c.guid))) 460 | 461 | def load(self, job, guid): 462 | for data, timestamp, tries, etag in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp // CacheEntry.c.tries // CacheEntry.c.etag, 463 | order_by=minidb.columns(CacheEntry.c.timestamp.desc, CacheEntry.c.tries.desc), 464 | where=CacheEntry.c.guid == guid, limit=1): 465 | return data, timestamp, tries, etag 466 | 467 | return None, None, 0, None 468 | 469 | def get_history_data(self, guid, count=1): 470 | history = {} 471 | if count < 1: 472 | return history 473 | for data, timestamp in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp, 474 | order_by=minidb.columns(CacheEntry.c.timestamp.desc, CacheEntry.c.tries.desc), 475 | where=(CacheEntry.c.guid == guid) 476 | & ((CacheEntry.c.tries == 0) | (CacheEntry.c.tries == None))): # noqa 477 | if data not in history: 478 | history[data] = timestamp 479 | if len(history) >= count: 480 | break 481 | return history 482 | 483 | def save(self, job, guid, data, timestamp, tries, etag=None): 484 | self.db.save(CacheEntry(guid=guid, timestamp=timestamp, data=data, tries=tries, etag=etag)) 485 | self.db.commit() 486 | 487 | def delete(self, guid): 488 | CacheEntry.delete_where(self.db, CacheEntry.c.guid == guid) 489 | self.db.commit() 490 | 491 
| def clean(self, guid): 492 | keep_id = next((CacheEntry.query(self.db, CacheEntry.c.id, where=CacheEntry.c.guid == guid, 493 | order_by=CacheEntry.c.timestamp.desc, limit=1)), (None,))[0] 494 | 495 | if keep_id is not None: 496 | result = CacheEntry.delete_where(self.db, (CacheEntry.c.guid == guid) & (CacheEntry.c.id != keep_id)) 497 | self.db.commit() 498 | return result 499 | 500 | return 0 501 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/thp/urlwatch.svg)](https://travis-ci.org/thp/urlwatch) 2 | [![Packaging status](https://repology.org/badge/tiny-repos/urlwatch.svg)](https://repology.org/metapackage/urlwatch/versions) 3 | [![PyPI version](https://badge.fury.io/py/urlwatch.svg)](https://badge.fury.io/py/urlwatch) 4 | 5 | 6 | ``` 7 | _ _ _ ____ 8 | _ _ _ __| |_ ____ _| |_ ___| |__ |___ \ 9 | | | | | '__| \ \ /\ / / _` | __/ __| '_ \ __) | 10 | | |_| | | | |\ V V / (_| | || (__| | | | / __/ 11 | \__,_|_| |_| \_/\_/ \__,_|\__\___|_| |_| |_____| 12 | 13 | ... monitors webpages for you 14 | ``` 15 | urlwatch is intended to help you watch changes in webpages and get notified 16 | (via e-mail, in your terminal or through various third party services) of any 17 | changes. The change notification will include the URL that has changed and 18 | a unified diff of what has changed. 
19 | 20 | 21 | DEPENDENCIES 22 | ------------ 23 | 24 | urlwatch 2 requires: 25 | 26 | * Python 3.5 or newer 27 | * [PyYAML](http://pyyaml.org/) 28 | * [minidb](https://thp.io/2010/minidb/) 29 | * [requests](http://python-requests.org/) 30 | * [keyring](https://github.com/jaraco/keyring/) 31 | * [appdirs](https://github.com/ActiveState/appdirs) 32 | * [lxml](https://lxml.de) 33 | * [cssselect](https://cssselect.readthedocs.io) 34 | * [enum34](https://pypi.org/project/enum34/) (Python 3.3 only) 35 | 36 | The dependencies can be installed with (add `--user` to install to `$HOME`): 37 | 38 | `python3 -m venv venv;source venv/bin/activate` 39 | 40 | `python3 -m pip install -r requirements-dev.txt` 41 | 42 | 43 | Optional dependencies (install via `python3 -m pip install `): 44 | 45 | * Pushover reporter: [chump](https://github.com/karanlyons/chump/) 46 | * Pushbullet reporter: [pushbullet.py](https://github.com/randomchars/pushbullet.py) 47 | * Stdout reporter with color on Windows: [colorama](https://github.com/tartley/colorama) 48 | * "browser" job kind: [pyppeteer](https://github.com/miyakogi/pyppeteer), Python 3.6 or newer 49 | * Unit testing: [pycodestyle](http://pycodestyle.pycqa.org/en/latest/) 50 | 51 | 52 | QUICK START 53 | ----------- 54 | 55 | 1. Start `urlwatch` to migrate your old data or start fresh 56 | 2. Use `urlwatch --edit` to customize your job list (this will create/edit `urls.yaml`) 57 | 3. Use `urlwatch --edit-config` if you want to set up e-mail sending 58 | 4. Use `urlwatch --edit-hooks` if you want to write custom subclasses 59 | 5. Add `urlwatch` to your crontab (`crontab -e`) to monitor webpages periodically 60 | 61 | The checking interval is defined by how often you run `urlwatch`. 62 | You can use e.g. [crontab.guru](https://crontab.guru) to figure out the 63 | schedule expression for the checking interval, we recommend not more often 64 | than 30 minutes (this would be `*/30 * * * *`). 
If you have never used 65 | cron before, check out the 66 | [crontab command help](https://www.computerhope.com/unix/ucrontab.htm). 67 | 68 | On Windows, `cron` is not installed by default. Use the 69 | [Windows Task Scheduler](https://en.wikipedia.org/wiki/Windows_Task_Scheduler) 70 | instead, or see [this StackOverflow question](https://stackoverflow.com/q/132971/1047040) 71 | for alternatives. 72 | 73 | 74 | TIPS AND TRICKS 75 | --------------- 76 | 77 | Quickly adding new URLs to the job list from the command line: 78 | 79 | ```urlwatch --add url=http://example.org,name=Example``` 80 | 81 | You can pick only a given HTML element with the built-in filter, for 82 | example to extract ```
<div id="something">...</div>
``` from a page, you 83 | can use the following in your urls.yaml: 84 | ```yaml 85 | url: http://example.org/ 86 | filter: element-by-id:something 87 | ``` 88 | 89 | Also, you can chain filters, so you can run html2text on the result: 90 | ```yaml 91 | url: http://example.net/ 92 | filter: element-by-id:something,html2text 93 | ``` 94 | 95 | The example urls.yaml file also demonstrates the use of built-in 96 | filters, here 3 filters are used: html2text, line-grep and whitespace 97 | removal to get just a certain info field from a webpage: 98 | ```yaml 99 | url: https://thp.io/2008/urlwatch/ 100 | filter: html2text,grep:Current.*version,strip 101 | ``` 102 | For most cases, this means that you can specify a filter chain in 103 | your urls.yaml page without requiring a custom hook where previously 104 | you would have needed to write custom filtering code in Python. 105 | 106 | If you are using the `grep` filter, you can grep for a comma (`,`) 107 | by using `\054` (`:` does not need to be escaped separately and 108 | can be used as-is), for example to convert HTML to text, then grep 109 | for `a,b:`, and then strip whitespace, use this: 110 | 111 | ```yaml 112 | url: https://example.org/ 113 | filter: html2text,grep:a\054b:,strip 114 | ``` 115 | 116 | If you want to extract only the body tag you can use this filer: 117 | ```yaml 118 | url: https://thp.io/2008/urlwatch/ 119 | filter: element-by-tag:body 120 | ``` 121 | 122 | You can also specify an external `diff`-style tool (a tool that takes 123 | two filenames (old, new) as parameter and returns on its standard output 124 | the difference of the files), for example to use GNU `wdiff` to get 125 | word-based differences instead of line-based difference: 126 | 127 | ```yaml 128 | url: https://example.com/ 129 | diff_tool: wdiff 130 | ``` 131 | 132 | Note that `diff_tool` specifies an external command-line tool, so that 133 | tool must be installed separately (e.g. 
`apt install wdiff` on Debian or 134 | `brew install wdiff` on macOS). Coloring is supported for `wdiff`-style 135 | output, but potentially not for other diff tools. 136 | 137 | To filter based on an [XPath](https://www.w3.org/TR/1999/REC-xpath-19991116/) 138 | expression, you can use the `xpath` filter like so (see Microsoft's 139 | [XPath Examples](https://msdn.microsoft.com/en-us/library/ms256086(v=vs.110).aspx) 140 | page for some other examples): 141 | 142 | ```yaml 143 | url: https://example.net/ 144 | filter: xpath:/body 145 | ``` 146 | 147 | This filters only the `` element of the HTML document, stripping 148 | out everything else. 149 | 150 | To filter based on a [CSS selector](https://www.w3.org/TR/2011/REC-css3-selectors-20110929/), 151 | you can use the `css` filter like so: 152 | 153 | ```yaml 154 | url: https://example.net/ 155 | filter: css:body 156 | ``` 157 | 158 | Some limitations and extensions exist as explained in 159 | [cssselect's documentation](https://cssselect.readthedocs.io/en/latest/#supported-selectors). 160 | 161 | In some cases, it might be useful to ignore (temporary) network errors to 162 | avoid notifications being sent. 
While there is a `display.error` config 163 | option (defaulting to `True`) to control reporting of errors globally, to 164 | ignore network errors for specific jobs only, you can use the 165 | `ignore_connection_errors` key in the job list configuration file: 166 | 167 | ```yaml 168 | url: https://example.com/ 169 | ignore_connection_errors: true 170 | ``` 171 | 172 | Similarly, you might want to ignore some (temporary) HTTP errors on the 173 | server side: 174 | 175 | ```yaml 176 | url: https://example.com/ 177 | ignore_http_error_codes: 408, 429, 500, 502, 503, 504 178 | ``` 179 | 180 | or ignore all HTTP errors if you like: 181 | 182 | ```yaml 183 | url: https://example.com/ 184 | ignore_http_error_codes: 4xx, 5xx 185 | ``` 186 | 187 | For web pages with misconfigured HTTP headers or rare encodings, it may 188 | be useful to explicitly specify an encoding from Python's 189 | [Standard Encodings](https://docs.python.org/3/library/codecs.html#standard-encodings). 190 | 191 | ```yaml 192 | url: https://example.com/ 193 | encoding: utf-8 194 | ``` 195 | 196 | By default, url jobs timeout after 60 seconds. If you want a different timeout 197 | period, use the `timeout` key to specify it in number of seconds, or set it to 0 198 | to never timeout. 199 | 200 | ```yaml 201 | url: https://example.com/ 202 | timeout: 300 203 | ``` 204 | 205 | If you want to change some settings for all your jobs, edit the `job_defaults` 206 | section in your config file: 207 | 208 | ```yaml 209 | ... 210 | job_defaults: 211 | all: 212 | diff_tool: wdiff 213 | url: 214 | ignore_connection_errors: true 215 | ``` 216 | The above config file sets all jobs to use wdiff as diff tool, and all "url" jobs 217 | to ignore connection errors. 218 | 219 | PUSHOVER 220 | -------- 221 | 222 | You can configure urlwatch to send real time notifications about changes 223 | via Pushover(https://pushover.net/). To enable this, ensure you have the 224 | chump python package installed (see DEPENDENCIES). 
Then edit your config 225 | (`urlwatch --edit-config`) and enable pushover. You will also need to add 226 | to the config your Pushover user key and a unique app key (generated by 227 | registering urlwatch as an application on your Pushover account(https://pushover.net/apps/build). 228 | 229 | You can send to a specific device by using the device name, as indicated when 230 | you add or view your list of devices in the Pushover console. For example 231 | `device: 'MyPhone'`, or `device: 'MyLaptop'`. To send to *all* of your 232 | devices, set `device: null` in your config (`urlwatch --edit-config`) or leave 233 | out the device configuration completely. 234 | 235 | 236 | PUSHBULLET 237 | -------- 238 | 239 | Pushbullet notifications are configured similarly to Pushover (see above). 240 | You'll need to add to the config your Pushbullet Access Token, which you 241 | can generate at https://www.pushbullet.com/#settings 242 | 243 | TELEGRAM 244 | -------- 245 | 246 | Telegram notifications are configured using the Telegram Bot API. 247 | For this, you'll need a Bot API token and a chat id (see https://core.telegram.org/bots). 248 | Sample configuration: 249 | 250 | ```yaml 251 | telegram: 252 | bot_token: '999999999:3tOhy2CuZE0pTaCtszRfKpnagOG8IQbP5gf' # your bot api token 253 | chat_id: '88888888' # the chat id where the messages should be sent 254 | enabled: true 255 | ``` 256 | 257 | To set up Telegram, from your Telegram app, chat up BotFather (New Message, 258 | Search, "BotFather"), then say `/newbot` and follow the instructions. 259 | Eventually it will tell you the bot token (in the form seen above, 260 | `:`) - add this to your config file. 261 | 262 | You can then click on the link of your bot, which will send the message `/start`. 263 | At this point, you can use the command `urlwatch --telegram-chats` to list the 264 | private chats the bot is involved with. This is the chat ID that you need to put 265 | into the config file as `chat_id`. 
You may add multiple chat IDs as a YAML list: 266 | ```yaml 267 | telegram: 268 | bot_token: '999999999:3tOhy2CuZE0pTaCtszRfKpnagOG8IQbP5gf' # your bot api token 269 | chat_id: 270 | - '11111111' 271 | - '22222222' 272 | enabled: true 273 | ``` 274 | 275 | Don't forget to also enable the reporter. 276 | 277 | 278 | SLACK 279 | ----- 280 | 281 | Slack notifications are configured using "Slack Incoming Webhooks". Here is a 282 | sample configuration: 283 | 284 | ```yaml 285 | slack: 286 | webhook_url: 'https://hooks.slack.com/services/T50TXXXXXU/BDVYYYYYYY/PWTqwyFM7CcCfGnNzdyDYZ' 287 | enabled: true 288 | ``` 289 | 290 | To set up Slack, from you Slack Team, create a new app and activate "Incoming Webhooks" on 291 | a channel, you'll get a webhook URL, copy it into the configuration as seen above. 292 | 293 | You can use the command `urlwatch --test-slack` to test if the Slack integration works. 294 | 295 | 296 | BROWSER 297 | ------- 298 | 299 | If the webpage you are trying to watch runs client-side JavaScript to 300 | render the page, [Pyppeteer](https://github.com/miyakogi/pyppeteer) can 301 | now be used to render the page in a headless Chromium instance first 302 | and then use the HTML of the resulting page. 303 | 304 | Use the `browser` kind in the configuration and the `navigate` key to set the 305 | URL to retrieve. note that the normal `url` job keys are not supported 306 | for the `browser` job types at the moment, for example: 307 | 308 | ```yaml 309 | kind: browser 310 | name: "A Page With JavaScript" 311 | navigate: http://example.org/ 312 | ``` 313 | 314 | Configure the optional `options` key to pass 315 | [additional options](https://miyakogi.github.io/pyppeteer/reference.html#pyppeteer.page.Page.goto) 316 | to Pyppeteer. 
For example: 317 | 318 | ```yaml 319 | kind: browser 320 | name: "A Page With JavaScript" 321 | navigate: http://example.org/ 322 | options: 323 | waitUntil: networkidle0 324 | ``` 325 | 326 | 327 | E-MAIL VIA GMAIL SMTP 328 | --------------------- 329 | 330 | You need to configure your GMail account to allow for "less secure" (password-based) 331 | apps to login: 332 | 333 | 1. Go to https://myaccount.google.com/ 334 | 2. Click on "Sign-in & security" 335 | 3. Scroll all the way down to "Allow less secure apps" and enable it 336 | 337 | Now, start the configuration editor: `urlwatch --edit-config` 338 | 339 | These are the keys you need to configure (see #158): 340 | 341 | - `report/email/enabled`: `true` 342 | - `report/email/from`: `your.username@gmail.com` (edit accordingly) 343 | - `report/email/method`: `smtp` 344 | - `report/email/smtp/host`: `smtp.gmail.com` 345 | - `report/email/smtp/auth`: `true` 346 | - `report/email/smtp/port`: `587` 347 | - `report/email/smtp/starttls`: `true` 348 | - `report/email/to`: The e-mail address you want to send reports to 349 | 350 | Now, for setting the password, it's not stored in the config file, but in your 351 | keychain. To store the password, run: `urlwatch --smtp-login` and enter your 352 | password. 
353 | 354 | 355 | E-MAIL VIA AMAZON SIMPLE EMAIL SERVICE (SES) 356 | -------------------------------------------- 357 | 358 | Start the configuration editor: `urlwatch --edit-config` 359 | 360 | These are the keys you need to configure: 361 | 362 | - `report/email/enabled`: `true` 363 | - `report/email/from`: `you@verified_domain.com` (edit accordingly) 364 | - `report/email/method`: `smtp` 365 | - `report/email/smtp/host`: `email-smtp.us-west-2.amazonaws.com` (edit accordingly) 366 | - `report/email/smtp/user`: `ABCDEFGHIJ1234567890` (edit accordingly) 367 | - `report/email/smtp/auth`: `true` 368 | - `report/email/smtp/port`: `587` (25 or 465 also work) 369 | - `report/email/smtp/starttls`: `true` 370 | - `report/email/to`: The e-mail address you want to send reports to 371 | 372 | The password is not stored in the config file, but in your keychain. To store 373 | the password, run: `urlwatch --smtp-login` and enter your password. 374 | 375 | 376 | SMTP LOGIN WITHOUT KEYRING 377 | -------------------------- 378 | 379 | If for whatever reason you cannot use a keyring to store your password 380 | (for example, when using it from a `cron` job) 381 | you can also set the `insecure_password` option in the SMTP config: 382 | 383 | - `report/email/smtp/auth`: `true` 384 | - `report/email/smtp/insecure_password`: `secret123` 385 | 386 | The `insecure_password` key will be preferred over the data stored in 387 | the keyring. Please note that as the name says, storing the password 388 | as plaintext in the configuration is insecure and bad practice, but 389 | for an e-mail account that's only dedicated for sending mails this 390 | might be a way. **Never ever use this with your your primary 391 | e-mail account!** Seriously! Create a throw-away GMail (or other) 392 | account just for sending out those e-mails or use local `sendmail` with 393 | a mail server configured instead of relying on SMTP and password auth. 
394 | 395 | Note that this makes it really easy for your password to be picked up 396 | by software running on your machine, by other users logged into the system 397 | and/or for the password to appear in log files accidentally. 398 | 399 | 400 | TESTING FILTERS 401 | --------------- 402 | 403 | While creating your filter pipeline, you might want to preview what the filtered 404 | output looks like. You can do so by first configuring your job and then running 405 | urlwatch with the `--test-filter` command, passing in the index (from `--list`) 406 | or the URL/location of the job to be tested: 407 | 408 | ``` 409 | urlwatch --test-filter 1 # Test the first job in the list 410 | urlwatch --test-filter https://example.net/ # Test the job with the given URL 411 | ``` 412 | 413 | The output of this command will be the filtered plaintext of the job, this is the 414 | output that will (in a real urlwatch run) be the input to the diff algorithm. 415 | 416 | 417 | SENDING COOKIES 418 | --------------- 419 | 420 | It is possible to add cookies to HTTP requests for pages that need it, the YAML 421 | syntax for this is: 422 | 423 | ```yaml 424 | url: http://example.com/ 425 | cookies: 426 | Key: ValueForKey 427 | OtherKey: OtherValue 428 | ``` 429 | 430 | 431 | WATCHING GITHUB RELEASES 432 | ------------------------ 433 | 434 | This is an example how to watch the GitHub "releases" page for a given 435 | project for the latest release version, to be notified of new releases: 436 | 437 | ```yaml 438 | url: "https://github.com/thp/urlwatch/releases/latest" 439 | filter: 440 | - xpath: '(//div[contains(@class,"release-timeline-tags")]//h4)[1]/a' 441 | - html2text: re 442 | ``` 443 | 444 | 445 | USING XPATH AND CSS FILTERS WITH XML AND EXCLUSIONS 446 | --------------------------------------------------- 447 | 448 | By default, XPath and CSS filters are set up for HTML documents. 
However, 449 | it is possible to use them for XML documents as well (these examples parse 450 | an RSS feed and filter only the titles and publication dates): 451 | 452 | ```yaml 453 | url: 'https://heronebag.com/blog/index.xml' 454 | filter: 455 | - xpath: 456 | path: '//item/title/text()|//item/pubDate/text()' 457 | method: xml 458 | ``` 459 | ```yaml 460 | url: 'https://heronebag.com/blog/index.xml' 461 | filter: 462 | - css: 463 | selector: 'item > title, item > pubDate' 464 | method: xml 465 | - html2text: re 466 | ``` 467 | 468 | To match an element in an [XML namespace](https://www.w3.org/TR/xml-names/), 469 | use a namespace prefix before the tag name. Use a `:` to seperate the namespace 470 | prefix and the tag name in an XPath expression, and use a `|` in a CSS selector. 471 | ```yaml 472 | url: 'https://www.wired.com/feed/rss' 473 | filter: 474 | - xpath: 475 | path: '//item/media:keywords' 476 | method: xml 477 | namespaces: 478 | media: http://search.yahoo.com/mrss/ 479 | ``` 480 | ```yaml 481 | url: 'https://www.wired.com/feed/rss' 482 | filter: 483 | - css: 484 | selector: 'item > media|keywords' 485 | method: xml 486 | namespaces: 487 | media: http://search.yahoo.com/mrss/ 488 | ``` 489 | Alternatively, use the XPath expression `//*[name()='']` to bypass 490 | the namespace entirely. 491 | 492 | Another useful option with XPath and CSS filters is `exclude`. Elements selected 493 | by this `exclude` expression are removed from the final result. For example, the 494 | following job will not have any `` tag in its results: 495 | 496 | ```yaml 497 | url: https://example.org/ 498 | filter: 499 | - css: 500 | selector: 'body' 501 | exclude: 'a' 502 | ``` 503 | 504 | 505 | COMPARE WITH SEVERAL LATEST SNAPSHOTS 506 | ------------------------------------- 507 | If a webpage frequently changes between several known stable states, it may be 508 | desirable to have changes reported only if the webpage changes into a new 509 | unknown state. 
You can use `compared_versions` to do this. 510 | 511 | ```yaml 512 | url: https://example.com/ 513 | compared_versions: 3 514 | ``` 515 | 516 | In this example, changes are only reported if the webpage becomes different from 517 | the latest three distinct states. The differences are shown relative to the 518 | closest match. 519 | 520 | 521 | REMOVE OR REPLACE TEXT USING REGULAR EXPRESSIONS 522 | ------------------------------------------------ 523 | 524 | Just like Python's `re.sub` function, there's the possibility to apply a regular 525 | expression and either remove of replace the matched text. The following example 526 | applies the filter 3 times: 527 | 528 | 1. Just specifying a string as the value will replace the matches with the empty string. 529 | 2. Simple patterns can be replaced with another string using "pattern" as the expression and "repl" as the replacement. 530 | 3. You can use groups (`()`) and back-reference them with `\1` (etc..) to put groups into the replacement string. 531 | 532 | All features are described in Python's [re.sub](https://docs.python.org/3/library/re.html#re.sub) 533 | documentation (the `pattern` and `repl` values are passed to this function as-is, with the value 534 | of `repl` defaulting to the empty string). 535 | 536 | 537 | ```yaml 538 | kind: url 539 | url: https://example.com/ 540 | filter: 541 | - re.sub: '\s*href="[^"]*"' 542 | - re.sub: 543 | pattern: '
<h1>
' 544 | repl: 'HEADING 1: ' 545 | - re.sub: 546 | pattern: ']*)>' 547 | repl: '' 548 | ``` 549 | 550 | 551 | MIGRATION FROM URLWATCH 1.x 552 | --------------------------- 553 | 554 | Migration from urlwatch 1.x should be automatic on first start. Here is a 555 | quick rundown of changes in 2.0: 556 | 557 | * URLs are stored in a YAML file now, with direct support for specifying 558 | names for jobs, different job kinds, directly applying filters, selecting 559 | the HTTP request method, specifying POST data as dictionary and much more 560 | * The cache directory has been replaced with a SQLite 3 database file 561 | "cache.db" in minidb format, storing all change history (use `--gc-cache` to 562 | remove old changes if you don't need them anymore) for further analysis 563 | * The hooks mechanism has been replaced with support for creating new job 564 | kinds by subclassing, new filters (also by subclassing) as well as new 565 | reporters (pieces of code that put the results somewhere, for example the 566 | default installation contains the "stdout" reporter that writes to the 567 | console and the "email" reporter that can send HTML and text e-mails) 568 | * A configuration file - urlwatch.yaml - has been added for specifying user 569 | preferences instead of having to supply everything via the command line 570 | 571 | 572 | CONTACT 573 | ------- 574 | 575 | Website: https://thp.io/2008/urlwatch/ 576 | 577 | E-Mail: m@thp.io 578 | -------------------------------------------------------------------------------- /lib/urlwatch/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 4 | # Copyright (c) 2008-2019 Thomas Perl 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # 1. 
Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # 2. Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in the 15 | # documentation and/or other materials provided with the distribution. 16 | # 3. The name of the author may not be used to endorse or promote products 17 | # derived from this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import re
import logging
import itertools
import os
import imp
import html.parser
import hashlib
import json

# for OCR
from PIL import Image
from io import BytesIO
import pytesseract

from enum import Enum
from lxml import etree
from lxml.cssselect import CSSSelector

from .util import TrackSubClasses

logger = logging.getLogger(__name__)


class FilterBase(object, metaclass=TrackSubClasses):
    """Base class for all filters.

    Subclasses with a __kind__ attribute are registered by the
    TrackSubClasses metaclass and looked up by name in process().
    """

    __subclasses__ = {}
    __anonymous_subclasses__ = []

    def __init__(self, job, state):
        self.job = job
        self.state = state

    def _no_subfilters(self, subfilter):
        # Helper for filters that do not accept a subfilter argument.
        if subfilter is not None:
            raise ValueError('No subfilters supported for {}'.format(self.__kind__))

    @classmethod
    def filter_documentation(cls):
        """Return a human-readable list of all registered filter kinds."""
        result = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            result.extend((
                ' * %s - %s' % (sc.__kind__, sc.__doc__),
            ))
        return '\n'.join(result)

    @classmethod
    def auto_process(cls, state, data):
        """Apply every filter whose match() returns True for this job."""
        filters = itertools.chain((filtercls for _, filtercls in
                                   sorted(cls.__subclasses__.items(), key=lambda k_v: k_v[0])),
                                  cls.__anonymous_subclasses__)

        for filtercls in filters:
            filter_instance = filtercls(state.job, state)
            if filter_instance.match():
                logger.info('Auto-applying filter %r to %s', filter_instance, state.job.get_location())
                data = filter_instance.filter(data)

        return data

    @classmethod
    def process(cls, filter_kind, subfilter, state, data):
        """Apply a single named filter kind (with optional subfilter) to data.

        Raises ValueError for unknown filter kinds.
        """
        logger.info('Applying filter %r, subfilter %r to %s', filter_kind, subfilter, state.job.get_location())
        filtercls = cls.__subclasses__.get(filter_kind, None)
        if filtercls is None:
            raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter))
        return filtercls(state.job, state).filter(data, subfilter)

    def match(self):
        # Subclasses override this to opt in to auto_process().
        return False

    def filter(self, data, subfilter=None):
        raise NotImplementedError()


class AutoMatchFilter(FilterBase):
    """Automatically matches subclass filters with a given location"""
    MATCH = None

    def match(self):
        if self.MATCH is None:
            return False

        d = self.job.to_dict()
        result = all(d.get(k, None) == v for k, v in self.MATCH.items())
        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result


class RegexMatchFilter(FilterBase):
    """Same as AutoMatchFilter but matching is done with regexes"""
    MATCH = None

    def match(self):
        if self.MATCH is None:
            return False

        d = self.job.to_dict()

        # It's a match if we have at least one key/value pair that matches,
        # and no key/value pairs that do not match
        matches = [v.match(d[k]) for k, v in self.MATCH.items() if k in d]
        result = len(matches) > 0 and all(matches)
        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result


class LegacyHooksPyFilter(FilterBase):
    """Runs the filter() function from a legacy (urlwatch 1.x) hooks.py file."""

    FILENAME = os.path.expanduser('~/.urlwatch/lib/hooks.py')

    def __init__(self, job, state):
        super().__init__(job, state)

        self.hooks = None
        if os.path.exists(self.FILENAME):
            try:
                # NOTE(review): the "imp" module is deprecated since Python
                # 3.4; consider migrating to importlib.util when the minimum
                # supported Python version allows.
                self.hooks = imp.load_source('legacy_hooks', self.FILENAME)
            except Exception as e:
                logger.error('Could not load legacy hooks file: %s', e)

    def match(self):
        return self.hooks is not None

    def filter(self, data, subfilter=None):
        try:
            result = self.hooks.filter(self.job.get_location(), data)
            if result is None:
                result = data
            return result
        except Exception as e:
            # logger.warn() is a deprecated alias of logger.warning().
            logger.warning('Could not apply legacy hooks filter: %s', e)
            return data
class Html2TextFilter(FilterBase):
    """Convert HTML to plaintext"""

    __kind__ = 'html2text'

    def filter(self, data, subfilter=None):
        """Convert data to plain text.

        The subfilter selects the conversion method ('re' by default): a
        string names the method directly, while a dict must contain a
        'method' key; its remaining items are passed on as options.
        """
        if subfilter is None:
            method = 're'
            options = {}
        elif isinstance(subfilter, dict):
            # Note: pop() mutates the caller-supplied dict.
            method = subfilter.pop('method')
            options = subfilter
        elif isinstance(subfilter, str):
            method = subfilter
            options = {}
        else:
            # Previously fell through with "method" unbound, producing a
            # confusing UnboundLocalError below.
            raise ValueError('Invalid subfilter for html2text: %r' % (subfilter,))
        from .html2txt import html2text
        return html2text(data, method=method, options=options)


class Ical2TextFilter(FilterBase):
    """Convert iCalendar to plaintext"""

    __kind__ = 'ical2text'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        from .ical2txt import ical2text
        return ical2text(data)


class JsonFormatFilter(FilterBase):
    """Convert to formatted json"""

    __kind__ = 'format-json'

    def filter(self, data, subfilter=None):
        # Optional subfilter: indentation width (default 4).
        indentation = 4
        if subfilter is not None:
            indentation = int(subfilter)
        parsed_json = json.loads(data)
        return json.dumps(parsed_json, ensure_ascii=False, sort_keys=True, indent=indentation, separators=(',', ': '))


class GrepFilter(FilterBase):
    """Filter only lines matching a regular expression"""

    __kind__ = 'grep'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('The grep filter needs a regular expression')

        return '\n'.join(line for line in data.splitlines()
                         if re.search(subfilter, line) is not None)


class InverseGrepFilter(FilterBase):
    """Filter which removes lines matching a regular expression"""

    __kind__ = 'grepi'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('The inverse grep filter needs a regular expression')

        return '\n'.join(line for line in data.splitlines()
                         if re.search(subfilter, line) is None)


class StripFilter(FilterBase):
    """Strip leading and trailing whitespace"""

    __kind__ = 'strip'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        return data.strip()


class CleanNewLinesFilter(FilterBase):
    """Clean up excessive newlines"""

    __kind__ = 'clean-new-lines'

    def filter(self, data, subfilter=None):
        self._no_subfilters(subfilter)
        # One-pass join instead of the previous quadratic "+=" loop; the
        # final strip() matches the original's trailing-newline removal and
        # also trims whitespace at the very start/end of the result.
        return '\n'.join(line for line in data.split('\n') if line.strip() != '').strip()


class FilterBy(Enum):
    """Selector mode for ElementsBy: match on an attribute or on a tag name."""
    ATTRIBUTE = 1
    TAG = 2


class ElementsBy(html.parser.HTMLParser):
    """HTML parser that collects elements matched by attribute or tag name."""

    def __init__(self, filter_by, name, value=None):
        super().__init__()

        self._filter_by = filter_by
        if self._filter_by == FilterBy.ATTRIBUTE:
            self._attributes = {name: value}
        else:
            self._name = name

        self._result = []     # HTML fragments of matched elements
        self._inside = False  # True while inside a matched element
        self._elts = []       # stack of open tags within the matched element

    def get_html(self):
        """Return the concatenated HTML of all matched elements."""
        return ''.join(self._result)

    def handle_starttag(self, tag, attrs):
        ad = dict(attrs)

        if self._filter_by == FilterBy.ATTRIBUTE and all(ad.get(k, None) == v for k, v in self._attributes.items()):
            self._inside = True
        elif self._filter_by == FilterBy.TAG and tag == self._name:
            self._inside = True

        if self._inside:
            self._result.append('<%s%s%s>' % (tag, ' ' if attrs else '',
                                              ' '.join('%s="%s"' % (k, v) for k, v in attrs)))
            self._elts.append(tag)

    def handle_endtag(self, tag):
        if self._inside:
            self._result.append('</%s>' % (tag,))
            if tag in self._elts:
                # Pop until the matching open tag is found; tolerates
                # improperly nested HTML.
                t = self._elts.pop()
                while t != tag and self._elts:
                    t = self._elts.pop()
            if not self._elts:
                self._inside = False

    def handle_data(self, data):
        if self._inside:
            self._result.append(data)


class GetElementById(FilterBase):
    """Get an HTML element by its ID"""

    __kind__ = 'element-by-id'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element ID for filtering')

        element_by_id = ElementsBy(FilterBy.ATTRIBUTE, 'id', subfilter)
        element_by_id.feed(data)
        return element_by_id.get_html()


class GetElementByClass(FilterBase):
    """Get all HTML elements by class"""

    __kind__ = 'element-by-class'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element class for filtering')

        element_by_class = ElementsBy(FilterBy.ATTRIBUTE, 'class', subfilter)
        element_by_class.feed(data)
        return element_by_class.get_html()


class GetElementByStyle(FilterBase):
    """Get all HTML elements by style"""

    __kind__ = 'element-by-style'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need an element style for filtering')

        element_by_style = ElementsBy(FilterBy.ATTRIBUTE, 'style', subfilter)
        element_by_style.feed(data)
        return element_by_style.get_html()


class GetElementByTag(FilterBase):
    """Get an HTML element by its tag"""

    __kind__ = 'element-by-tag'

    def filter(self, data, subfilter=None):
        if subfilter is None:
            raise ValueError('Need a tag for filtering')

        element_by_tag = ElementsBy(FilterBy.TAG, subfilter)
        element_by_tag.feed(data)
        return element_by_tag.get_html()
= 'ocr' 368 | 369 | def filter(self, data, subfilter=None): 370 | image_data = Image.open(BytesIO(data)) 371 | return pytesseract.image_to_string(image_data) 372 | 373 | class Sha1Filter(FilterBase): 374 | """Calculate the SHA-1 checksum of the content""" 375 | 376 | __kind__ = 'sha1sum' 377 | 378 | def filter(self, data, subfilter=None): 379 | self._no_subfilters(subfilter) 380 | sha = hashlib.sha1() 381 | sha.update(data.encode('utf-8', 'ignore')) 382 | return sha.hexdigest() 383 | 384 | 385 | class HexdumpFilter(FilterBase): 386 | """Convert binary data to hex dump format""" 387 | 388 | __kind__ = 'hexdump' 389 | 390 | def filter(self, data, subfilter=None): 391 | self._no_subfilters(subfilter) 392 | data = bytearray(data.encode('utf-8', 'ignore')) 393 | blocks = [data[i * 16:(i + 1) * 16] for i in range(int((len(data) + (16 - 1)) / 16))] 394 | return '\n'.join('%s %s' % (' '.join('%02x' % c for c in block), 395 | ''.join((chr(c) if (c > 31 and c < 127) else '.') 396 | for c in block)) for block in blocks) 397 | 398 | 399 | class LxmlParser: 400 | EXPR_NAMES = {'css': 'a CSS selector', 401 | 'xpath': 'an XPath expression'} 402 | 403 | def __init__(self, filter_kind, subfilter, expr_key): 404 | self.filter_kind = filter_kind 405 | if subfilter is None: 406 | raise ValueError('Need %s for filtering' % (self.EXPR_NAMES[filter_kind],)) 407 | if isinstance(subfilter, str): 408 | self.expression = subfilter 409 | self.method = 'html' 410 | self.exclude = None 411 | self.namespaces = None 412 | elif isinstance(subfilter, dict): 413 | if expr_key not in subfilter: 414 | raise ValueError('Need %s for filtering' % (self.EXPR_NAMES[filter_kind],)) 415 | self.expression = subfilter[expr_key] 416 | self.method = subfilter.get('method', 'html') 417 | self.exclude = subfilter.get('exclude') 418 | self.namespaces = subfilter.get('namespaces') 419 | if self.method not in ('html', 'xml'): 420 | raise ValueError('%s method must be "html" or "xml", got %r' % (filter_kind, 
self.method)) 421 | if self.method == 'html' and self.namespaces is not None: 422 | raise ValueError('Namespace prefixes only supported with "xml" method.') 423 | else: 424 | raise ValueError('%s subfilter must be a string or dict' % (filter_kind,)) 425 | self.parser = (etree.HTMLParser if self.method == 'html' else etree.XMLParser)() 426 | self.data = '' 427 | 428 | def feed(self, data): 429 | self.data += data 430 | 431 | def _to_string(self, element): 432 | # Handle "/text()" selector, which returns lxml.etree._ElementUnicodeResult (Issue #282) 433 | if isinstance(element, str): 434 | return element 435 | 436 | return etree.tostring(element, pretty_print=True, method=self.method, encoding='unicode', with_tail=False) 437 | 438 | @staticmethod 439 | def _remove_element(element): 440 | parent = element.getparent() 441 | if parent is None: 442 | # Do not exclude root element 443 | return 444 | if isinstance(element, etree._ElementUnicodeResult): 445 | if element.is_tail: 446 | parent.tail = None 447 | elif element.is_text: 448 | parent.text = None 449 | elif element.is_attribute: 450 | del parent.attrib[element.attrname] 451 | else: 452 | previous = element.getprevious() 453 | if element.tail is not None: 454 | if previous is not None: 455 | previous.tail = previous.tail + element.tail if previous.tail else element.tail 456 | else: 457 | parent.text = parent.text + element.tail if parent.text else element.tail 458 | parent.remove(element) 459 | 460 | def _reevaluate(self, element): 461 | if self._orphaned(element): 462 | return None 463 | if isinstance(element, etree._ElementUnicodeResult): 464 | parent = element.getparent() 465 | if parent is None: 466 | return element 467 | if element.is_tail: 468 | return parent.tail 469 | elif element.is_text: 470 | return parent.text 471 | elif element.is_attribute: 472 | return parent.attrib.get(element.attrname) 473 | else: 474 | return element 475 | 476 | def _orphaned(self, element): 477 | if isinstance(element, 
etree._ElementUnicodeResult): 478 | parent = element.getparent() 479 | if ((element.is_tail and parent.tail is None) 480 | or (element.is_text and parent.text is None) 481 | or (element.is_attribute and parent.attrib.get(element.attrname) is None)): 482 | return True 483 | else: 484 | element = parent 485 | try: 486 | tree = element.getroottree() 487 | path = tree.getpath(element) 488 | return element is not tree.xpath(path, namespaces=self.namespaces)[0] 489 | except (ValueError, IndexError): 490 | return True 491 | 492 | def _get_filtered_elements(self): 493 | try: 494 | root = etree.fromstring(self.data, self.parser) 495 | except ValueError: 496 | # Strip XML declaration, for example: '' 497 | # for https://heronebag.com/blog/index.xml, an error happens, as we get a 498 | # a (Unicode) string, but the XML contains its own "encoding" declaration 499 | self.data = re.sub(r'^<[?]xml[^>]*[?]>', '', self.data) 500 | # Retry parsing with XML declaration removed (Fixes #281) 501 | root = etree.fromstring(self.data, self.parser) 502 | if root is None: 503 | return [] 504 | excluded_elems = None 505 | if self.filter_kind == 'css': 506 | selected_elems = CSSSelector(self.expression, 507 | namespaces=self.namespaces).evaluate(root) 508 | excluded_elems = CSSSelector(self.exclude, 509 | namespaces=self.namespaces).evaluate(root) if self.exclude else None 510 | elif self.filter_kind == 'xpath': 511 | selected_elems = root.xpath(self.expression, namespaces=self.namespaces) 512 | excluded_elems = root.xpath(self.exclude, namespaces=self.namespaces) if self.exclude else None 513 | if excluded_elems is not None: 514 | for el in excluded_elems: 515 | self._remove_element(el) 516 | return [el for el in map(self._reevaluate, selected_elems) if el is not None] 517 | 518 | def get_filtered_data(self): 519 | filtered_data = '\n'.join(self._to_string(element) for element in self._get_filtered_elements()) 520 | # TODO: get parent self.job object and pass here 521 | # if not 
filtered_data.strip(): 522 | # raise ValueError('%s did not match any markup for job: %r' % (self.filter_kind, self.job)) 523 | return filtered_data 524 | 525 | 526 | class CssFilter(FilterBase): 527 | """Filter XML/HTML using CSS selectors""" 528 | 529 | __kind__ = 'css' 530 | 531 | def filter(self, data, subfilter=None): 532 | lxml_parser = LxmlParser('css', subfilter, 'selector') 533 | lxml_parser.feed(data) 534 | filtered_data = lxml_parser.get_filtered_data() 535 | # TODO: move this raise higher in class 536 | if not filtered_data.strip(): 537 | raise ValueError('CssFilter did not match any markup for job: {}'.format(self.job)) 538 | return filtered_data 539 | 540 | 541 | class XPathFilter(FilterBase): 542 | """Filter XML/HTML using XPath expressions""" 543 | 544 | __kind__ = 'xpath' 545 | 546 | def filter(self, data, subfilter=None): 547 | lxml_parser = LxmlParser('xpath', subfilter, 'path') 548 | lxml_parser.feed(data) 549 | filtered_data = lxml_parser.get_filtered_data() 550 | # TODO: move this raise higher in class 551 | if not filtered_data.strip(): 552 | raise ValueError('XPathFilter did not match any markup for job: {}'.format(self.job)) 553 | return filtered_data 554 | 555 | 556 | class RegexSub(FilterBase): 557 | """Replace text with regular expressions using Python's re.sub""" 558 | 559 | __kind__ = 're.sub' 560 | 561 | def filter(self, data, subfilter=None): 562 | if subfilter is None: 563 | raise ValueError('{} needs a subfilter'.format(self.__kind__)) 564 | 565 | # Allow for just specifying a regular expression (that will be removed) 566 | if isinstance(subfilter, str): 567 | subfilter = {'pattern': subfilter} 568 | 569 | # Default: Replace with empty string if no "repl" value is set 570 | return re.sub(subfilter.get('pattern'), subfilter.get('repl', ''), data) 571 | -------------------------------------------------------------------------------- /lib/urlwatch/reporters.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # This file is part of urlwatch (https://thp.io/2008/urlwatch/). 3 | # Copyright (c) 2008-2019 Thomas Perl 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions 8 | # are met: 9 | # 10 | # 1. Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 2. Redistributions in binary form must reproduce the above copyright 13 | # notice, this list of conditions and the following disclaimer in the 14 | # documentation and/or other materials provided with the distribution. 15 | # 3. The name of the author may not be used to endorse or promote products 16 | # derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | 30 | import difflib 31 | import tempfile 32 | import subprocess 33 | import re 34 | import shlex 35 | import email.utils 36 | import itertools 37 | import logging 38 | import os 39 | import sys 40 | import time 41 | import html 42 | import functools 43 | 44 | import requests 45 | 46 | import urlwatch 47 | from .mailer import SMTPMailer 48 | from .mailer import SendmailMailer 49 | from .util import TrackSubClasses 50 | 51 | try: 52 | import chump 53 | except ImportError: 54 | chump = None 55 | 56 | try: 57 | from pushbullet import Pushbullet 58 | except ImportError: 59 | Pushbullet = None 60 | 61 | logger = logging.getLogger(__name__) 62 | 63 | 64 | # Regular expressions that match the added/removed markers of GNU wdiff output 65 | WDIFF_ADDED_RE = r'[{][+].*?[+][}]' 66 | WDIFF_REMOVED_RE = r'[\[][-].*?[-][]]' 67 | 68 | 69 | class ReporterBase(object, metaclass=TrackSubClasses): 70 | __subclasses__ = {} 71 | 72 | def __init__(self, report, config, job_states, duration): 73 | self.report = report 74 | self.config = config 75 | self.job_states = job_states 76 | self.duration = duration 77 | 78 | def convert(self, othercls): 79 | if hasattr(othercls, '__kind__'): 80 | config = self.report.config['report'][othercls.__kind__] 81 | else: 82 | config = {} 83 | 84 | return othercls(self.report, config, self.job_states, self.duration) 85 | 86 | @classmethod 87 | def reporter_documentation(cls): 88 | result = [] 89 | for sc in TrackSubClasses.sorted_by_kind(cls): 90 | result.extend(( 91 | ' * %s - %s' % (sc.__kind__, sc.__doc__), 92 | )) 93 | return '\n'.join(result) 94 | 95 | @classmethod 96 | def submit_all(cls, report, job_states, duration): 97 | any_enabled = False 98 | for name, subclass in cls.__subclasses__.items(): 99 | cfg = report.config['report'].get(name, {'enabled': False}) 100 | if cfg['enabled']: 101 | any_enabled = True 102 | logger.info('Submitting with %s (%r)', name, subclass) 103 | subclass(report, cfg, job_states, duration).submit() 104 | 105 | 
if not any_enabled: 106 | logger.warn('No reporters enabled.') 107 | 108 | def submit(self): 109 | raise NotImplementedError() 110 | 111 | def unified_diff(self, job_state): 112 | if job_state.job.diff_tool is not None: 113 | with tempfile.TemporaryDirectory() as tmpdir: 114 | old_file_path = os.path.join(tmpdir, 'old_file') 115 | new_file_path = os.path.join(tmpdir, 'new_file') 116 | with open(old_file_path, 'w+b') as old_file, open(new_file_path, 'w+b') as new_file: 117 | old_file.write(job_state.old_data.encode('utf-8')) 118 | new_file.write(job_state.new_data.encode('utf-8')) 119 | cmdline = shlex.split(job_state.job.diff_tool) + [old_file_path, new_file_path] 120 | proc = subprocess.Popen(cmdline, stdout=subprocess.PIPE) 121 | stdout, _ = proc.communicate() 122 | # Diff tools return 0 for "nothing changed" or 1 for "files differ", anything else is an error 123 | if proc.returncode in (0, 1): 124 | return stdout.decode('utf-8') 125 | else: 126 | raise subprocess.CalledProcessError(proc.returncode, cmdline) 127 | 128 | timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) 129 | timestamp_new = email.utils.formatdate(time.time(), localtime=1) 130 | return ''.join(difflib.unified_diff([l + '\n' for l in job_state.old_data.splitlines()], 131 | [l + '\n' for l in job_state.new_data.splitlines()], 132 | '@', '@', timestamp_old, timestamp_new)) 133 | 134 | 135 | class SafeHtml(object): 136 | def __init__(self, s): 137 | self.s = s 138 | 139 | def __str__(self): 140 | return self.s 141 | 142 | def format(self, *args, **kwargs): 143 | return str(self).format(*(html.escape(str(arg)) for arg in args), 144 | **{k: html.escape(str(v)) for k, v in kwargs.items()}) 145 | 146 | 147 | class HtmlReporter(ReporterBase): 148 | def submit(self): 149 | yield from (str(part) for part in self._parts()) 150 | 151 | def _parts(self): 152 | cfg = self.report.config['report']['html'] 153 | 154 | yield SafeHtml(""" 155 | 156 | urlwatch 157 | 158 | 159 | 170 | 171 | """) 
172 | 173 | for job_state in self.report.get_filtered_job_states(self.job_states): 174 | job = job_state.job 175 | 176 | if job.LOCATION_IS_URL: 177 | title = '{pretty_name}' 178 | elif job.pretty_name() != job.get_location(): 179 | title = '{pretty_name}' 180 | else: 181 | title = '{location}' 182 | title = '

{verb}: ' + title + '

' 183 | 184 | yield SafeHtml(title).format(verb=job_state.verb, 185 | location=job.get_location(), 186 | pretty_name=job.pretty_name()) 187 | 188 | content = self._format_content(job_state, cfg['diff']) 189 | if content is not None: 190 | yield content 191 | 192 | yield SafeHtml('
') 193 | 194 | yield SafeHtml(""" 195 |
196 | {pkgname} {version}, {copyright}
197 | Website: {url}
198 | watched {count} URLs in {duration} seconds 199 |
200 | 201 | 202 | """).format(pkgname=urlwatch.pkgname, version=urlwatch.__version__, copyright=urlwatch.__copyright__, 203 | url=urlwatch.__url__, count=len(self.job_states), duration=self.duration.seconds) 204 | 205 | def _diff_to_html(self, unified_diff): 206 | for line in unified_diff.splitlines(): 207 | if line.startswith('+'): 208 | yield SafeHtml('{line}').format(line=line) 209 | elif line.startswith('-'): 210 | yield SafeHtml('{line}').format(line=line) 211 | else: 212 | yield SafeHtml('{line}').format(line=line) 213 | 214 | def _format_content(self, job_state, difftype): 215 | if job_state.verb == 'error': 216 | return SafeHtml('
{error}
').format(error=job_state.traceback.strip()) 217 | 218 | if job_state.verb == 'unchanged': 219 | return SafeHtml('
{old_data}
').format(old_data=job_state.old_data) 220 | 221 | if job_state.old_data in (None, job_state.new_data): 222 | return SafeHtml('...') 223 | 224 | if difftype == 'table': 225 | timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) 226 | timestamp_new = email.utils.formatdate(time.time(), localtime=1) 227 | html_diff = difflib.HtmlDiff() 228 | return SafeHtml(html_diff.make_table(job_state.old_data.splitlines(1), job_state.new_data.splitlines(1), 229 | timestamp_old, timestamp_new, True, 3)) 230 | elif difftype == 'unified': 231 | return ''.join(( 232 | '
',
233 |                 '\n'.join(self._diff_to_html(self.unified_diff(job_state))),
234 |                 '
class TextReporter(ReporterBase):
    """Base class for reporters that render the report as plain text."""

    def submit(self):
        cfg = self.report.config['report']['text']
        line_length = cfg['line_length']
        show_details = cfg['details']
        show_footer = cfg['footer']

        if cfg['minimal']:
            # Minimal mode: one "VERB: location" line per job, no details
            for job_state in self.report.get_filtered_job_states(self.job_states):
                pretty_name = job_state.job.pretty_name()
                location = job_state.job.get_location()
                if pretty_name != location:
                    location = '%s ( %s )' % (pretty_name, location)
                yield ': '.join((job_state.verb.upper(), location))
            return

        summary = []
        details = []
        for job_state in self.report.get_filtered_job_states(self.job_states):
            summary_part, details_part = self._format_output(job_state, line_length)
            summary.extend(summary_part)
            details.extend(details_part)

        if summary:
            # line_length == 0 disables separator lines (sep becomes None
            # and is filtered out below)
            sep = (line_length * '=') or None
            yield from (part for part in itertools.chain(
                (sep,),
                ('%02d. %s' % (idx + 1, line) for idx, line in enumerate(summary)),
                (sep, ''),
            ) if part is not None)

        if show_details:
            yield from details

        if summary and show_footer:
            yield from ('-- ',
                        '%s %s, %s' % (urlwatch.pkgname, urlwatch.__version__, urlwatch.__copyright__),
                        'Website: %s' % (urlwatch.__url__,),
                        'watched %d URLs in %d seconds' % (len(self.job_states), self.duration.seconds))

    def _format_content(self, job_state):
        # Returns the text body for one job state, or None when there is
        # nothing to show (new job or unchanged content)
        if job_state.verb == 'error':
            return job_state.traceback.strip()

        if job_state.verb == 'unchanged':
            return job_state.old_data

        if job_state.old_data in (None, job_state.new_data):
            return None

        return self.unified_diff(job_state)

    def _format_output(self, job_state, line_length):
        summary_part = []
        details_part = []

        pretty_name = job_state.job.pretty_name()
        location = job_state.job.get_location()
        if pretty_name != location:
            location = '%s ( %s )' % (pretty_name, location)

        pretty_summary = ': '.join((job_state.verb.upper(), pretty_name))
        summary = ': '.join((job_state.verb.upper(), location))
        content = self._format_content(job_state)

        summary_part.append(pretty_summary)

        sep = (line_length * '-') or None
        details_part.extend((sep, summary, sep))
        if content is not None:
            details_part.extend((content, sep))
        details_part.extend(('', '') if sep else ('',))
        details_part = [part for part in details_part if part is not None]

        return summary_part, details_part


class StdoutReporter(TextReporter):
    """Print summary on stdout (the console)"""

    __kind__ = 'stdout'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Only colorize when writing to a real terminal and color is enabled
        self._has_color = sys.stdout.isatty() and self.config.get('color', False)

    def _incolor(self, color_id, s):
        # Wrap s in an ANSI bright-color escape sequence (90 + color_id)
        if self._has_color:
            return '\033[9%dm%s\033[0m' % (color_id, s)
        return s

    def _red(self, s):
        return self._incolor(1, s)

    def _green(self, s):
        return self._incolor(2, s)

    def _yellow(self, s):
        return self._incolor(3, s)

    def _blue(self, s):
        return self._incolor(4, s)

    def _get_print(self):
        # On Windows, route output through colorama so ANSI escapes work
        if sys.platform == 'win32' and self._has_color:
            from colorama import AnsiToWin32
            return functools.partial(print, file=AnsiToWin32(sys.stdout).stream)
        return print

    def submit(self):
        print = self._get_print()

        cfg = self.report.config['report']['text']
        line_length = cfg['line_length']

        separators = (line_length * '=', line_length * '-', '-- ') if line_length else ()
        body = '\n'.join(super().submit())

        for line in body.splitlines():
            # Basic colorization for wdiff-style differences
            line = re.sub(WDIFF_ADDED_RE, lambda x: self._green(x.group(0)), line)
            line = re.sub(WDIFF_REMOVED_RE, lambda x: self._red(x.group(0)), line)

            # FIXME: This isn't ideal, but works for now...
            if line in separators:
                print(line)
            elif line.startswith('+'):
                print(self._green(line))
            elif line.startswith('-'):
                print(self._red(line))
            elif any(line.startswith(prefix) for prefix in ('NEW:', 'CHANGED:', 'UNCHANGED:', 'ERROR:')):
                first, second = line.split(' ', 1)
                if line.startswith('ERROR:'):
                    print(first, self._red(second))
                else:
                    print(first, self._blue(second))
            else:
                print(line)


class EMailReporter(TextReporter):
    """Send summary via e-mail / SMTP"""

    __kind__ = 'email'

    def submit(self):
        filtered_job_states = list(self.report.get_filtered_job_states(self.job_states))

        subject_args = {
            'count': len(filtered_job_states),
            'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states),
        }
        subject = self.config['subject'].format(**subject_args)

        body_text = '\n'.join(super().submit())

        if not body_text:
            logger.debug('Not sending e-mail (no changes)')
            return
        if self.config['method'] == "smtp":
            smtp_user = self.config['smtp'].get('user', None) or self.config['from']
            # Legacy support: The current smtp "auth" setting was previously called "keyring"
            if 'keyring' in self.config['smtp']:
                logger.info('The SMTP config key "keyring" is now called "auth".')
            use_auth = self.config['smtp'].get('auth', self.config['smtp'].get('keyring', False))
            mailer = SMTPMailer(smtp_user, self.config['smtp']['host'], self.config['smtp']['port'],
                                self.config['smtp']['starttls'], use_auth,
                                self.config['smtp'].get('insecure_password'))
        elif self.config['method'] == "sendmail":
            mailer = SendmailMailer(self.config['sendmail']['path'])
        else:
            logger.error('Invalid entry for method {method}'.format(method=self.config['method']))
            # Bail out: without a valid method there is no mailer
            # (previously fell through and raised UnboundLocalError)
            return

        if self.config['html']:
            body_html = '\n'.join(self.convert(HtmlReporter).submit())

            msg = mailer.msg_html(self.config['from'], self.config['to'], subject, body_text, body_html)
        else:
            msg = mailer.msg_plain(self.config['from'], self.config['to'], subject, body_text)

        mailer.send(msg)


class WebServiceReporter(TextReporter):
    """Base class for push-style web services with a small message limit."""

    MAX_LENGTH = 1024

    def web_service_get(self):
        """Return a service handle; implemented by subclasses."""
        raise NotImplementedError

    def web_service_submit(self, service, title, body):
        """Deliver body to the service; implemented by subclasses."""
        raise NotImplementedError

    def submit(self):
        body_text = '\n'.join(super().submit())

        if not body_text:
            logger.debug('Not sending %s (no changes)', self.__kind__)
            return

        # Truncate to the service's maximum message length
        if len(body_text) > self.MAX_LENGTH:
            body_text = body_text[:self.MAX_LENGTH]

        try:
            service = self.web_service_get()
        except Exception:
            logger.error('Failed to load or connect to %s - are the dependencies installed and configured?',
                         self.__kind__, exc_info=True)
            return

        self.web_service_submit(service, 'Website Change Detected', body_text)


class PushoverReport(WebServiceReporter):
    """Send summary via pushover.net"""

    __kind__ = 'pushover'

    def web_service_get(self):
        app = chump.Application(self.config['app'])
        return app.get_user(self.config['user'])

    def web_service_submit(self, service, title, body):
        sound = self.config['sound']
        # If device is the empty string or not specified at all, use None to send to all devices
        # (see https://github.com/thp/urlwatch/issues/372)
        device = self.config.get('device', None) or None
        msg = service.create_message(title=title, message=body, html=True, sound=sound, device=device)
        msg.send()


class PushbulletReport(WebServiceReporter):
    """Send summary via pushbullet.com"""

    __kind__ = 'pushbullet'

    def web_service_get(self):
        return Pushbullet(self.config['api_key'])

    def web_service_submit(self, service, title, body):
        service.push_note(title, body)


class MailGunReporter(TextReporter):
    """Custom email reporter that uses Mailgun"""

    __kind__ = 'mailgun'

    def submit(self):
        region = self.config.get('region', '')
        domain = self.config['domain']
        api_key = self.config['api_key']
        from_name = self.config['from_name']
        from_mail = self.config['from_mail']
        to = self.config['to']

        # 'us' is Mailgun's default region and has no API-host prefix
        if region == 'us':
            region = ''

        if region != '':
            region = ".{0}".format(region)

        filtered_job_states = list(self.report.get_filtered_job_states(self.job_states))
        subject_args = {
            'count': len(filtered_job_states),
            'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states),
        }
        subject = self.config['subject'].format(**subject_args)

        body_text = '\n'.join(super().submit())
        body_html = '\n'.join(self.convert(HtmlReporter).submit())

        if not body_text:
            logger.debug('Not calling Mailgun API (no changes)')
            return

        logger.debug("Sending Mailgun request for domain:'{0}'".format(domain))
        result = requests.post(
            "https://api{0}.mailgun.net/v3/{1}/messages".format(region, domain),
            auth=("api", api_key),
            data={"from": "{0} <{1}>".format(from_name, from_mail),
                  "to": to,
                  "subject": subject,
                  "text": body_text,
                  "html": body_html})

        try:
            json_res = result.json()

            if result.status_code == requests.codes.ok:
                logger.info("Mailgun response: id '{0}'. {1}".format(json_res['id'], json_res['message']))
            else:
                logger.error("Mailgun error: {0}".format(json_res['message']))
        except ValueError:
            logger.error(
                "Failed to parse Mailgun response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                               result.content))

        return result


class TelegramReporter(TextReporter):
    """Custom Telegram reporter"""

    # Telegram's sendMessage API caps messages at 4096 characters
    MAX_LENGTH = 4096

    __kind__ = 'telegram'

    def submit(self):
        bot_token = self.config['bot_token']
        chat_ids = self.config['chat_id']
        # Accept a single chat id or a list of them
        chat_ids = [chat_ids] if isinstance(chat_ids, str) else chat_ids

        text = '\n'.join(super().submit())

        if not text:
            logger.debug('Not calling telegram API (no changes)')
            return

        result = None
        for chunk in self.chunkstring(text, self.MAX_LENGTH):
            for chat_id in chat_ids:
                res = self.submitToTelegram(bot_token, chat_id, chunk)
                # Check for None before touching status_code (the previous
                # operand order would raise AttributeError on a None result)
                if res is None or res.status_code != requests.codes.ok:
                    result = res

        return result

    def submitToTelegram(self, bot_token, chat_id, text):
        logger.debug("Sending telegram request to chat id:'{0}'".format(chat_id))
        result = requests.post(
            "https://api.telegram.org/bot{0}/sendMessage".format(bot_token),
            data={"chat_id": chat_id, "text": text, "disable_web_page_preview": "true"})
        try:
            json_res = result.json()

            if result.status_code == requests.codes.ok:
                logger.info("Telegram response: ok '{0}'. {1}".format(json_res['ok'], json_res['result']))
            else:
                logger.error("Telegram error: {0}".format(json_res['description']))
        except ValueError:
            logger.error(
                "Failed to parse telegram response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                                result.content))
        return result

    def chunkstring(self, string, length):
        # Yield successive slices of at most `length` characters
        return (string[0 + i:length + i] for i in range(0, len(string), length))


class SlackReporter(TextReporter):
    """Custom Slack reporter"""

    # Slack rejects payloads with text longer than 40000 characters
    MAX_LENGTH = 40000

    __kind__ = 'slack'

    def submit(self):
        webhook_url = self.config['webhook_url']
        text = '\n'.join(super().submit())

        if not text:
            logger.debug('Not calling slack API (no changes)')
            return

        result = None
        for chunk in self.chunkstring(text, self.MAX_LENGTH):
            res = self.submit_to_slack(webhook_url, chunk)
            # Check for None before touching status_code (the previous
            # operand order would raise AttributeError on a None result)
            if res is None or res.status_code != requests.codes.ok:
                result = res

        return result

    def submit_to_slack(self, webhook_url, text):
        logger.debug("Sending slack request with text:{0}".format(text))
        post_data = {"text": text}
        result = requests.post(webhook_url, json=post_data)
        try:
            if result.status_code == requests.codes.ok:
                logger.info("Slack response: ok")
            else:
                logger.error("Slack error: {0}".format(result.text))
        except ValueError:
            logger.error(
                "Failed to parse slack response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                             result.content))
        return result

    def chunkstring(self, string, length):
        # Yield successive slices of at most `length` characters
        return (string[0 + i:length + i] for i in range(0, len(string), length))


class WebhookReporter(TextReporter):
    """Custom IFTTT webhook reporter for Google Sheets"""

    __kind__ = 'webhook'

    def submit(self):
        result = None
        for job_state in self.report.get_filtered_job_states(self.job_states):
            pretty_name = job_state.job.pretty_name()
            location = job_state.job.get_location()
            content = self._format_content(job_state)

            res = self.post_report(pretty_name, location, content)
            # Check for None before touching status_code (the previous
            # operand order would raise AttributeError on a None result)
            if res is None or res.status_code != requests.codes.ok:
                result = res
        return result

    def post_report(self, pretty_name, location, content):
        # IFTTT webhooks accept up to three generic values per event
        webhook_url = self.config['webhook_url']
        post_data = {'value1': pretty_name, 'value2': location, 'value3': content}
        result = requests.post(webhook_url, json=post_data)

        try:
            if result.status_code == requests.codes.ok:
                logger.info("Webhook response: ok")
            else:
                logger.error("Webhook error: {0}".format(result.text))
        except ValueError:
            # Fixed typo in log message: "webook" -> "webhook"
            logger.error(
                "Failed to parse webhook response. HTTP status code: {0}, content: {1}".format(result.status_code,
                                                                                               result.content))
        return result