├── finance_dl ├── __init__.py ├── chromedriver_wrapper.py ├── ofx_rename.py ├── csv_merge.py ├── google_login.py ├── cli.py ├── google_takeout.py ├── ebmud.py ├── google_purchases.py ├── anthem.py ├── pge.py ├── comcast.py ├── update.py ├── stockplanconnect.py ├── waveapps.py ├── ultipro_google.py ├── paypal.py ├── amazon.py ├── venmo.py ├── healthequity.py ├── ofx.py ├── scrape_lib.py └── mint.py ├── .gitignore ├── mypy.ini ├── .travis.yml ├── tox.ini ├── .bumpversion.cfg ├── tests └── test_ofx.py ├── setup.py ├── .style.yapf ├── README.md ├── example_finance_dl_config.py └── LICENSE /finance_dl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | .mypy_cache 4 | .tox 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unused_configs = True 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "3.5" 5 | - "3.6" 6 | install: pip install tox-travis 7 | script: tox 8 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py37 4 | 5 | [testenv] 6 | deps = 7 | mypy 8 | 9 | commands = 10 | pytest . 11 | mypy finance_dl 12 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.3.2 3 | tag = True 4 | commit = True 5 | message = chore: update package version to {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | search = version='{current_version}' 9 | replace = version='{new_version}' 10 | 11 | -------------------------------------------------------------------------------- /finance_dl/chromedriver_wrapper.py: -------------------------------------------------------------------------------- 1 | """Runs chromedriver in a new process group. 2 | 3 | This prevents it from being killed when typing Control+c in an interactive 4 | shell. 
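The wrapper detaches into its own process group (where supported) and then execs the real chromedriver binary; the binary to run can be overridden via the `ACTUAL_CHROMEDRIVER_PATH` environment variable and otherwise defaults to `chromedriver` on `PATH`.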
5 | """ 6 | 7 | import os 8 | import sys 9 | import chromedriver_binary 10 | 11 | 12 | def main(): 13 | 14 | try: 15 | os.setpgrp() 16 | except: 17 | # os.setpgrp not available on Windows 18 | pass 19 | 20 | executable_path = os.getenv('ACTUAL_CHROMEDRIVER_PATH', 'chromedriver') 21 | os.execvp(executable_path, [executable_path] + sys.argv[1:]) 22 | -------------------------------------------------------------------------------- /tests/test_ofx.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from finance_dl.ofx import sanitize_account_name 4 | 5 | 6 | def test_sanitize_account_name_disallows_dot(): 7 | with pytest.raises(ValueError): 8 | sanitize_account_name('.') 9 | 10 | 11 | def test_sanitize_account_name_disallows_double_dot(): 12 | with pytest.raises(ValueError): 13 | sanitize_account_name('..') 14 | 15 | 16 | def test_sanitize_account_name_passes_through_standard_characters(): 17 | account_name = 'abc1234.5678-90-XYZ' 18 | 19 | assert sanitize_account_name(account_name) == account_name 20 | 21 | 22 | def test_sanitize_account_name_replaces_invalid_characters(): 23 | assert sanitize_account_name('1234$!5678:XYZ') == '1234-5678-XYZ' 24 | -------------------------------------------------------------------------------- /finance_dl/ofx_rename.py: -------------------------------------------------------------------------------- 1 | """Renames improperly-named OFX files generated by the finance_dl.ofx module""" 2 | import argparse 3 | import os 4 | 5 | import bs4 6 | 7 | from .ofx import get_ofx_date_range 8 | 9 | 10 | def fix_name(path, dry_run): 11 | name = os.path.basename(path) 12 | d = os.path.dirname(path) 13 | date_format = '%Y%m%d' 14 | 15 | parts = name.split('-') 16 | assert len(parts) == 4 17 | 18 | with open(path, 'rb') as f: 19 | date_range = get_ofx_date_range(f.read()) 20 | new_parts = [ 21 | date_range[0].strftime(date_format), date_range[1].strftime(date_format) 22 | ] + parts[2:] 23 | new_name = '-'.join(new_parts) 24 | if new_name != name: 25 | new_path = os.path.join(d, new_name) 26 | print('Rename %s -> %s' % (path, new_path)) 27 | if not dry_run: 28 | os.rename(path, new_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | ap = argparse.ArgumentParser() 33 | ap.add_argument('paths', nargs='*') 34 | args = ap.parse_args() 35 | ap.add_argument('--dry-run', action='store_true') 36 | for path in args.paths: 37 | fix_name(path, dry_run=args.dry_run) 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | with open(os.path.join(os.path.dirname(__file__), 'README.md'), 'r') as f: 5 | long_description = f.read() 6 | 7 | setup( 8 | name='finance-dl', 9 | description='Tools for scraping personal financial data.', 10 | long_description=long_description, 11 | long_description_content_type='text/markdown', 12 | version='1.3.2', 13 | url='https://github.com/jbms/finance-dl', 14 | author='Jeremy Maitin-Shepard', 15 | author_email="jeremy@jeremyms.com", 16 | license='GPLv2', 17 | packages=["finance_dl"], 18 | entry_points={ 19 | 'console_scripts': [ 20 | 'finance-dl = finance_dl.cli:main', 21 | 'finance-dl-chromedriver-wrapper = finance_dl.chromedriver_wrapper:main', 22 | ], 23 | }, 24 | python_requires='>=3.5', 25 | install_requires=[ 26 | 'bs4', 27 | 'mintapi>=1.31', 28 | 'ofxclient', 29 | 'selenium', 30 | 'ipython', 31 | 'selenium-requests', 32 | 
'chromedriver-binary', 33 | 'beancount>=2.1.2', 34 | 'atomicwrites>=1.3.0', 35 | 'jsonschema', 36 | ], 37 | tests_require=[ 38 | 'pytest', 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /finance_dl/csv_merge.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import collections 3 | import os 4 | 5 | from atomicwrites import atomic_write 6 | 7 | 8 | def merge_overlapping_csv_rows(csv_data_list, compare_fields): 9 | """Merge overlapping CSV files. 10 | 11 | Rows are compared based on the list 'compare_fields' of field names. 12 | The number of duplicate copies of a row kept in the result is equal 13 | to the maximum number of duplicates in any single file. 14 | 15 | :param csv_data_list: list of rows, each row being represented by a 16 | dict 17 | :param compare_fields: list of field names by which duplicates are 18 | detected. 19 | 20 | :return: Returns the merged list of rows. 21 | """ 22 | 23 | def convert_row(row): 24 | return tuple(row[field] for field in compare_fields) 25 | 26 | merged_counter = collections.Counter() 27 | merged_rows = [] 28 | for csv_data in csv_data_list: 29 | cur_counter = collections.Counter() 30 | for row in csv_data: 31 | converted_row = convert_row(row) 32 | cur_counter[converted_row] += 1 33 | if cur_counter[converted_row] > merged_counter[converted_row]: 34 | merged_rows.append(row) 35 | merged_counter[converted_row] += 1 36 | return merged_rows 37 | 38 | 39 | def write_csv(field_names, data, filename): 40 | with atomic_write(filename, mode='w', newline='', encoding='utf-8') as f: 41 | csv_writer = csv.DictWriter( 42 | f, field_names, lineterminator='\n', quoting=csv.QUOTE_ALL) 43 | csv_writer.writeheader() 44 | csv_writer.writerows(data) 45 | 46 | 47 | def merge_into_file(filename, 48 | field_names, 49 | data, 50 | sort_by=None, 51 | compare_fields=None): 52 | if compare_fields is None: 53 | compare_fields = field_names 54 | 55 | if os.path.exists(filename): 56 | with open(filename, 'r', newline='', encoding='utf-8') as f: 57 | reader = csv.DictReader(f) 58 | assert reader.fieldnames == field_names, (reader.fieldnames, field_names) 59 | existing_rows = list(reader) 60 | data = merge_overlapping_csv_rows([existing_rows, data], 61 | compare_fields=compare_fields) 62 | if sort_by is not None: 63 | data.sort(key=sort_by) 64 | write_csv(field_names=field_names, data=data, filename=filename) 65 | -------------------------------------------------------------------------------- /finance_dl/google_login.py: -------------------------------------------------------------------------------- 1 | """Handles Google account login.""" 2 | 3 | import logging 4 | from typing import Dict, cast, Any 5 | 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.common.keys import Keys 8 | 9 | from . 
import scrape_lib 10 | 11 | logger = logging.getLogger('google_login') 12 | 13 | 14 | def login(scraper: scrape_lib.Scraper, login_url: str): 15 | logger.info('Initiating log in') 16 | with scraper.wait_for_page_load(): 17 | scraper.driver.get(login_url) 18 | 19 | cur_url = scraper.driver.current_url 20 | if not cur_url.startswith('https://accounts.google.com/'): 21 | logger.info('Assuming already logged in due to url of %s', cur_url) 22 | return 23 | 24 | logger.info('Waiting for username or password field') 25 | 26 | def find_username_or_other_account_button(): 27 | username = scraper.find_visible_elements(By.XPATH, 28 | '//input[@type="email"]') 29 | password = scraper.find_visible_elements(By.XPATH, 30 | '//input[@type="password"]') 31 | other_account = scraper.find_visible_elements( 32 | By.XPATH, '//div[text()="Use another account"]') 33 | if len(username) == 1: 34 | return username[0], None, None 35 | if len(password) == 1: 36 | return None, password[0], None 37 | if len(other_account) == 1: 38 | return None, None, other_account[0] 39 | return None 40 | 41 | (username, password, other_account_button 42 | ), = scraper.wait_and_return(find_username_or_other_account_button) 43 | if other_account_button: 44 | scraper.click(other_account_button) 45 | (username, ), = scraper.wait_and_return( 46 | lambda: scraper.find_visible_elements(By.XPATH, '//input[@type="email"]') 47 | ) 48 | credentials = cast(Any, scraper).credentials # type: Dict[str, str] 49 | if not password: 50 | logger.info('Entering username') 51 | username.send_keys(credentials['username']) 52 | username.send_keys(Keys.ENTER) 53 | logger.info('Waiting for password field') 54 | (password, ), = scraper.wait_and_return( 55 | lambda: scraper.find_visible_elements(By.XPATH, '//input[@type="password"]') 56 | ) 57 | logger.info('Entering password') 58 | password.send_keys(credentials['password']) 59 | password.send_keys(Keys.ENTER) 60 | -------------------------------------------------------------------------------- /finance_dl/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import logging 4 | import json 5 | 6 | 7 | def get_log_level(name): 8 | name = name.upper() 9 | numeric_level = getattr(logging, name, None) 10 | if not isinstance(numeric_level, int): 11 | raise ValueError('Invalid log level: %s' % name) 12 | return name 13 | 14 | 15 | def main(): 16 | ap = argparse.ArgumentParser() 17 | ap.add_argument('--config-module', type=str, 18 | help='Python module defining CONFIG_ functions.') 19 | spec_group = ap.add_mutually_exclusive_group(required=True) 20 | spec_group.add_argument('--config', '-c', type=str, 21 | help='Configuration name to use.') 22 | spec_group.add_argument('--spec', '-s', type=json.loads, 23 | help='JSON configuration specification') 24 | ap.add_argument('--interactive', '-i', action='store_true', default=False, 25 | help='Start interactive shell.') 26 | ap.add_argument( 27 | '--visible', action='store_true', help= 28 | 'Run with a visible browser (if applicable). Implied by --interactive.' 
29 | ) 30 | ap.add_argument('--log', type=get_log_level, default=logging.INFO, 31 | help='Log level.') 32 | args = ap.parse_args() 33 | logging.basicConfig( 34 | level=args.log, 35 | format='%(asctime)s %(filename)s:%(lineno)d [%(levelname)s] %(message)s') 36 | 37 | if args.config_module: 38 | config_module = importlib.import_module(args.config_module) 39 | else: 40 | config_module = object() 41 | 42 | if args.config: 43 | key_prefix = 'CONFIG_' 44 | config_key = key_prefix + args.config 45 | if config_key is None: 46 | valid_keys = sorted( 47 | k for k in vars(config_module) if k.startswith(key_prefix)) 48 | raise KeyError( 49 | 'Invalid configuration key: %r. Valid configuration keys: %r.' 50 | % (config_key, valid_keys)) 51 | spec = getattr(config_module, config_key, None)() 52 | else: 53 | spec = args.spec 54 | module_name = spec.pop('module') 55 | module = importlib.import_module(module_name) 56 | 57 | headless = not args.visible 58 | if args.interactive: 59 | headless = False 60 | spec.setdefault('headless', headless) 61 | 62 | if args.interactive: 63 | 64 | def run_interactive_shell(**ns): 65 | import IPython 66 | user_ns = dict(vars(module), **ns) 67 | 68 | # Don't leave __name__ set, as that causes IPython to override the 69 | # real module's entry in sys.modules. 70 | user_ns.pop('__name__', None) 71 | IPython.terminal.ipapp.launch_new_instance( 72 | argv=[ 73 | '--no-banner', 74 | '--no-autoindent', 75 | '--InteractiveShellApp.exec_lines=["%load_ext autoreload", "%autoreload 2"]', 76 | ], 77 | user_ns=user_ns, 78 | ) 79 | 80 | interactive_func = getattr(module, 'interactive', None) 81 | if interactive_func is not None: 82 | with interactive_func(**spec) as ns: 83 | run_interactive_shell(**ns) 84 | else: 85 | run_interactive_shell() 86 | else: 87 | module.run(**spec) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /finance_dl/google_takeout.py: -------------------------------------------------------------------------------- 1 | """Retrieves Google data using https://takeout.google.com 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Google Takeout website. 5 | 6 | This is not itself a finance_dl data source, but is used by the 7 | `finance_dl.google_purchases` module. 8 | """ 9 | 10 | from typing import List, Any, Iterable, FrozenSet 11 | import urllib.parse 12 | import re 13 | import io 14 | import logging 15 | import time 16 | import zipfile 17 | from selenium.webdriver.common.by import By 18 | from . import scrape_lib 19 | from . 
import google_login 20 | 21 | logger = logging.getLogger('google_takeout') 22 | 23 | netloc_re = r'^([^\.@]+\.)*google.com$' 24 | 25 | 26 | def check_url(url): 27 | result = urllib.parse.urlparse(url) 28 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 29 | raise RuntimeError('Reached invalid URL: %r' % url) 30 | 31 | 32 | class Scraper(scrape_lib.Scraper): 33 | def __init__(self, credentials: dict, **kwargs): 34 | super().__init__(**kwargs) 35 | self.credentials = credentials 36 | 37 | def check_after_wait(self): 38 | check_url(self.driver.current_url) 39 | 40 | def _get_categories(self): 41 | categories, = self.wait_and_return(lambda: self.driver.find_elements( 42 | By.XPATH, '//input[@type="checkbox"]')) 43 | return categories 44 | 45 | def _create_archive(self, categories: FrozenSet[str]): 46 | logger.info('Selecting categories') 47 | checkboxes = self._get_categories() 48 | found_ids = set() 49 | for checkbox in checkboxes: 50 | value = checkbox.get_attribute('value') 51 | found_ids.add(value) 52 | wanted = value in categories 53 | checked = checkbox.get_attribute('checked') == 'true' 54 | if wanted != checked: 55 | checkbox.click() 56 | remaining = categories - found_ids 57 | if remaining: 58 | raise RuntimeError( 59 | 'Categories not found: %s' % ', '.join(sorted(remaining))) 60 | logger.info('Creating archive') 61 | checkboxes[0].submit() 62 | 63 | def _get_download_links(self): 64 | download_links = self.driver.find_elements(By.XPATH, 65 | '//a[.="Download"]') 66 | return [x.get_attribute('href') for x in download_links] 67 | 68 | def get_takeout_zipfile(self, categories: Iterable[str]) -> zipfile.ZipFile: 69 | """Returns a zipfile containing the specified takeout categories.""" 70 | google_login.login(self, 71 | 'https://takeout.google.com/settings/takeout/light') 72 | # Wait for at least one checkbox 73 | self._get_categories() 74 | # Wait 2 seconds to be sure all have loaded and then get new checkboxes 75 | time.sleep(2) 76 | # Get existing download links 77 | download_links = self._get_download_links() 78 | self._create_archive(categories=frozenset(categories)) 79 | 80 | for attempt_i in range(3): 81 | logger.info('Waiting for new download links (attempt %d)', 82 | attempt_i + 1) 83 | # Wait 10 seconds for the archive to be created 84 | time.sleep(10) 85 | with self.wait_for_page_load(): 86 | self.driver.refresh() 87 | new_download_links = set( 88 | self._get_download_links()) - set(download_links) 89 | if len(new_download_links) == 0: continue 90 | if len(new_download_links) > 1: 91 | raise RuntimeError('More than one new archive found') 92 | break 93 | new_download_link = list(new_download_links)[0] 94 | logger.info('Downloading archive') 95 | google_login.login(self, new_download_link) 96 | (_, data), = self.wait_and_return(self.get_downloaded_file) 97 | return zipfile.ZipFile(io.BytesIO(data)) 98 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | # Align closing bracket with visual indentation. 3 | ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT=True 4 | 5 | # Allow lambdas to be formatted on more than one line. 6 | ALLOW_MULTILINE_LAMBDAS=True 7 | 8 | # Insert a blank line before a 'def' or 'class' immediately nested 9 | # within another 'def' or 'class'. For example: 10 | # 11 | # class Foo: 12 | # # <------ this blank line 13 | # def method(): 14 | # ... 
15 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF=False 16 | 17 | # The column limit. 18 | COLUMN_LIMIT=80 19 | 20 | # Indent width used for line continuations. 21 | CONTINUATION_INDENT_WIDTH=4 22 | 23 | # Put closing brackets on a separate line, dedented, if the bracketed 24 | # expression can't fit in a single line. Applies to all kinds of brackets, 25 | # including function definitions and calls. For example: 26 | # 27 | # config = { 28 | # 'key1': 'value1', 29 | # 'key2': 'value2', 30 | # } # <--- this bracket is dedented and on a separate line 31 | # 32 | # time_series = self.remote_client.query_entity_counters( 33 | # entity='dev3246.region1', 34 | # key='dns.query_latency_tcp', 35 | # transform=Transformation.AVERAGE(window=timedelta(seconds=60)), 36 | # start_ts=now()-timedelta(days=3), 37 | # end_ts=now(), 38 | # ) # <--- this bracket is dedented and on a separate line 39 | DEDENT_CLOSING_BRACKETS=False 40 | 41 | # The regex for an i18n comment. The presence of this comment stops 42 | # reformatting of that line, because the comments are required to be 43 | # next to the string they translate. 44 | I18N_COMMENT= 45 | 46 | # The i18n function call names. The presence of this function stops 47 | # reformattting on that line, because the string it has cannot be moved 48 | # away from the i18n comment. 49 | I18N_FUNCTION_CALL= 50 | 51 | # Indent the dictionary value if it cannot fit on the same line as the 52 | # dictionary key. For example: 53 | # 54 | # config = { 55 | # 'key1': 56 | # 'value1', 57 | # 'key2': value1 + 58 | # value2, 59 | # } 60 | INDENT_DICTIONARY_VALUE=False 61 | 62 | # The number of columns to use for indentation. 63 | INDENT_WIDTH=4 64 | 65 | # Join short lines into one line. E.g., single line 'if' statements. 66 | JOIN_MULTIPLE_LINES=True 67 | 68 | # Use spaces around the power operator. 69 | SPACES_AROUND_POWER_OPERATOR=False 70 | 71 | # The number of spaces required before a trailing comment. 72 | SPACES_BEFORE_COMMENT=2 73 | 74 | # Insert a space between the ending comma and closing bracket of a list, 75 | # etc. 76 | SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET=True 77 | 78 | # Split before arguments if the argument list is terminated by a 79 | # comma. 80 | SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED=True 81 | 82 | # Set to True to prefer splitting before '&', '|' or '^' rather than 83 | # after. 84 | SPLIT_BEFORE_BITWISE_OPERATOR=False 85 | 86 | # If an argument / parameter list is going to be split, then split before 87 | # the first argument. 88 | SPLIT_BEFORE_FIRST_ARGUMENT=False 89 | 90 | # Set to True to prefer splitting before 'and' or 'or' rather than 91 | # after. 92 | SPLIT_BEFORE_LOGICAL_OPERATOR=False 93 | 94 | # Split named assignments onto individual lines. 95 | SPLIT_BEFORE_NAMED_ASSIGNS=True 96 | 97 | # The penalty for splitting right after the opening bracket. 98 | SPLIT_PENALTY_AFTER_OPENING_BRACKET=30 99 | 100 | # The penalty for splitting the line after a unary operator. 101 | SPLIT_PENALTY_AFTER_UNARY_OPERATOR=10000 102 | 103 | # The penalty for splitting right before an if expression. 104 | SPLIT_PENALTY_BEFORE_IF_EXPR=0 105 | 106 | # The penalty of splitting the line around the '&', '|', and '^' 107 | # operators. 108 | SPLIT_PENALTY_BITWISE_OPERATOR=300 109 | 110 | # The penalty for characters over the column limit. 111 | SPLIT_PENALTY_EXCESS_CHARACTER=2600 112 | 113 | # The penalty incurred by adding a line split to the unwrapped line. The 114 | # more line splits added the higher the penalty. 
115 | SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT=30 116 | 117 | # The penalty of splitting a list of "import as" names. For example: 118 | # 119 | # from a_very_long_or_indented_module_name_yada_yad import (long_argument_1, 120 | # long_argument_2, 121 | # long_argument_3) 122 | # 123 | # would reformat to something like: 124 | # 125 | # from a_very_long_or_indented_module_name_yada_yad import ( 126 | # long_argument_1, long_argument_2, long_argument_3) 127 | SPLIT_PENALTY_IMPORT_NAMES=0 128 | 129 | # The penalty of splitting the line around the 'and' and 'or' 130 | # operators. 131 | SPLIT_PENALTY_LOGICAL_OPERATOR=300 132 | 133 | # Use the Tab character for indentation. 134 | USE_TABS=False 135 | -------------------------------------------------------------------------------- /finance_dl/ebmud.py: -------------------------------------------------------------------------------- 1 | """Retrieves East Bay Municipal Utility District (EBMUD) PDF water bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Bill Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_ebmud(): 39 | return dict( 40 | module='finance_dl.ebmud', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'ebmud'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import logging 58 | import os 59 | 60 | import urllib.parse 61 | import dateutil.parser 62 | from selenium.webdriver.common.by import By 63 | from selenium.webdriver.support.ui import Select 64 | from selenium.webdriver.common.keys import Keys 65 | 66 | from . 
import scrape_lib 67 | 68 | logger = logging.getLogger('ebmud_scrape') 69 | 70 | netloc_re = r'^([^\.@]+\.)*ebmud.com$' 71 | 72 | 73 | def check_url(url): 74 | result = urllib.parse.urlparse(url) 75 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 76 | raise RuntimeError('Reached invalid URL: %r' % url) 77 | 78 | 79 | class Scraper(scrape_lib.Scraper): 80 | def __init__(self, credentials, output_directory, **kwargs): 81 | super().__init__(**kwargs) 82 | self.credentials = credentials 83 | self.output_directory = output_directory 84 | self.logged_in = False 85 | 86 | def check_after_wait(self): 87 | check_url(self.driver.current_url) 88 | 89 | def login(self): 90 | if self.logged_in: 91 | return 92 | logger.info('Initiating log in') 93 | self.driver.get( 94 | 'https://www.ebmud.com/customers/account/manage-your-account') 95 | 96 | (username, password), = self.wait_and_return( 97 | self.find_username_and_password_in_any_frame) 98 | logger.info('Entering username and password') 99 | username.send_keys(self.credentials['username']) 100 | password.send_keys(self.credentials['password']) 101 | with self.wait_for_page_load(): 102 | password.send_keys(Keys.ENTER) 103 | logger.info('Logged in') 104 | self.logged_in = True 105 | 106 | def get_statements(self): 107 | logger.info('Looking for statement link') 108 | statements_link, = self.wait_and_locate((By.LINK_TEXT, 109 | 'View Statements')) 110 | statements_link.click() 111 | 112 | (statements_table, ), = self.wait_and_return( 113 | lambda: self.find_visible_elements_by_descendant_partial_text('Statement Date', 'table') 114 | ) 115 | rows = statements_table.find_elements_by_xpath('tbody/tr/td') 116 | for row in rows: 117 | row_text_parts = row.text.split() 118 | assert len(row_text_parts) == 4 119 | statement_date = dateutil.parser.parse(row_text_parts[0]).date() 120 | output_date_format = '%Y-%m-%d' 121 | statement_path = os.path.join( 122 | self.output_directory, '%s.bill.pdf' % 123 | (statement_date.strftime(output_date_format), )) 124 | if os.path.exists(statement_path): 125 | logger.info('Skipping existing statement: %s', statement_path) 126 | continue 127 | logger.info('Downloading %s', statement_path) 128 | self.click(row) 129 | download_result, = self.wait_and_return(self.get_downloaded_file) 130 | tmp_path = statement_path + '.tmp' 131 | with open(tmp_path, 'wb') as f: 132 | f.write(download_result[1]) 133 | os.rename(tmp_path, statement_path) 134 | logger.info('Wrote %s', statement_path) 135 | 136 | def run(self): 137 | self.login() 138 | self.get_statements() 139 | 140 | 141 | def run(**kwargs): 142 | scrape_lib.run_with_scraper(Scraper, **kwargs) 143 | 144 | 145 | def interactive(**kwargs): 146 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Python package for scraping personal financial data from financial 2 | institutions. 3 | 4 | [![License: GPL v2](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](LICENSE) 5 | [![Build Status](https://travis-ci.com/jbms/finance-dl.svg?branch=master)](https://travis-ci.com/jbms/finance-dl) 6 | 7 | This package may be useful on its own, but is specifically designed to be 8 | used with 9 | [beancount-import](https://github.com/jbms/beancount-import). 
10 | 11 | Supported data sources 12 | == 13 | 14 | - [finance_dl.ofx](finance_dl/ofx.py): uses 15 | [ofxclient](https://github.com/captin411/ofxclient) to download data 16 | using the OFX protocol. 17 | - [finance_dl.mint](finance_dl/mint.py): uses 18 | [mintapi](https://github.com/mrooney/mintapi) to download data from 19 | the Mint.com website. 20 | - [finance_dl.venmo](finance_dl/venmo.py): downloads transaction and 21 | balance information from the Venmo.com website 22 | - [finance_dl.paypal](finance_dl/paypal.py): downloads transactions 23 | from the Paypal.com website 24 | - [finance_dl.amazon](finance_dl/amazon.py): downloads order invoices 25 | from the Amazon website 26 | - [finance_dl.healthequity](finance_dl/healthequity.py): downloads 27 | transaction history and balance information from the HealthEquity 28 | website. 29 | - [finance_dl.google_purchases](finance_dl/google_purchases.py): 30 | downloads purchases that Google has heuristically extracted from 31 | Gmail messages. 32 | - [finance_dl.stockplanconnect](finance_dl/stockplanconnect.py): 33 | downloads PDF documents (including release and trade confirmations) 34 | from the Morgan Stanley Stockplanconnect website. 35 | - [finance_dl.pge](finance_dl/pge.py): downloads Pacific Gas & 36 | Electric (PG&E) PDF bills. 37 | - [finance_dl.comcast](finance_dl/comcast.py): downloads Comcast PDF 38 | bills. 39 | - [finance_dl.ebmud](finance_dl/ebmud.py): downloads East Bay 40 | Municipal Utility District (EBMUD) water bills. 41 | - [finance_dl.anthem](finance_dl/anthem.py): downloads Anthem 42 | BlueCross insurance claim statements. 43 | - [finance_dl.waveapps](finance_dl/waveapps.py): downloads receipt 44 | images and extracted transaction data from 45 | [Wave](https://waveapps.com), which is a free receipt-scanning 46 | website/mobile app. 47 | - [finance_dl.ultipro_google](finance_dl/ultipro_google.py): downloads 48 | Google employee payroll statements in PDF format from Ultipro. 49 | 50 | Setup 51 | == 52 | 53 | To install the most recent published package from PyPi, simply type: 54 | 55 | ```shell 56 | pip install finance-dl 57 | ``` 58 | 59 | To install from a clone of the repository, type: 60 | 61 | ```shell 62 | pip install . 63 | ``` 64 | 65 | or for development: 66 | 67 | ```shell 68 | pip install -e . 69 | ``` 70 | 71 | Configuration 72 | == 73 | 74 | Create a Python file like `example_finance_dl_config.py`. 75 | 76 | Refer to the documentation of the individual scraper modules for 77 | details. 78 | 79 | Basic Usage 80 | == 81 | 82 | You can run a scraping configuration named `myconfig` as follows: 83 | 84 | python -m finance_dl.cli --config-module example_finance_dl_config --config myconfig 85 | 86 | The configuration `myconfig` refers to a function named 87 | `CONFIG_myconfig` in the configuration module. 88 | 89 | Make sure that your configuration module is accessible in your Python 90 | `sys.path`. Since `sys.path` includes the current directory by 91 | default, you can simply run this command from the directory that 92 | contains your configuration module. 93 | 94 | By default, the scrapers run fully automatically, and the ones based 95 | on `selenium` and `chromedriver` run in headless mode. If the initial 96 | attempt for a `selenium`-based scraper fails, it is automatically 97 | retried again with the browser window visible. This allows you to 98 | manually complete the login process and enter any multi-factor 99 | authentication code that is required. 
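A configuration can also be passed directly on the command line as JSON using the `--spec` (`-s`) option instead of `--config`. For example (placeholder values shown; the accepted keys depend on the scraper module):

```shell
python -m finance_dl.cli --spec '{"module": "finance_dl.comcast", "credentials": {"username": "XXXXXX", "password": "XXXXXX"}, "output_directory": "/path/to/data/comcast"}'
```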
100 | 101 | To debug a scraper, you can run it in interactive mode by specifying 102 | the `-i` command-line argument. This runs an interactive IPython 103 | shell that lets you manually invoke parts of the scraping process. 104 | 105 | Automatic Usage 106 | == 107 | 108 | To run multiple configurations at once, and keep track of when each 109 | configuration was last updated, you can use the `finance_dl.update` 110 | tool. 111 | 112 | To display the update status, first create a `logs` directory and run: 113 | 114 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs status 115 | 116 | Initially, this will indicate that none of the configurations have 117 | been updated. To update a single configuration `myconfig`, run: 118 | 119 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update myconfig 120 | 121 | With a single configuration specified, this does the same thing as the 122 | `finance_dl.cli` tool, except that the log messages are written to 123 | `logs/myconfig.txt` and a `logs/myconfig.lastupdate` file is created 124 | if it is successful. 125 | 126 | If multiple configurations are specified, as in: 127 | 128 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update myconfig1 myconfig2 129 | 130 | then all specified configurations are run in parallel. 131 | 132 | To update all configurations, run: 133 | 134 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update --all 135 | 136 | License 137 | == 138 | 139 | Copyright (C) 2014-2018 Jeremy Maitin-Shepard. 140 | 141 | Distributed under the GNU General Public License, Version 2.0 only. 142 | See [LICENSE](LICENSE) file for details. 143 | -------------------------------------------------------------------------------- /finance_dl/google_purchases.py: -------------------------------------------------------------------------------- 1 | """Retrieves purchase and reservation history from Google. 2 | 3 | This contains purchases that have been heuristically extracted from Gmail 4 | messages, and possibly other sources. 5 | 6 | This uses the `selenium` Python package in conjunction with `chromedriver` to 7 | scrape the Google Takeout and Google purchases/reservations websites. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 15 | keys. 16 | 17 | - `output_directory`: Required. Must be a `str` that specifies the path on the 18 | local filesystem where the output will be written. If the directory does not 19 | exist, it will be created. 20 | 21 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 22 | path to a persistent Chrome browser profile to use. This should be a path 23 | used solely for this single configuration; it should not refer to your normal 24 | browser profile. If not specified, a fresh temporary profile will be used 25 | each time. 26 | 27 | Output format: 28 | ============== 29 | 30 | For each purchase, two files are written to the specified `output_directory`: 31 | `.html` contains the raw HTML content of the order details page, and 32 | `order_.json` is a JSON file in the Google Takeout Purchases/Reservations 33 | format. 
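For example, a purchase with order ID `123456789` is written as `123456789.html` and `order_123456789.json`.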
34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_google_purchases(): 39 | return dict( 40 | module='finance_dl.google_purchases', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'google_purchases'), 46 | # profile_dir is optional. 47 | profile_dir=os.path.join(profile_dir, 'google_purchases'), 48 | ) 49 | 50 | Interactive shell: 51 | ================== 52 | 53 | From the interactive shell, type: `self.run()` to start the scraper. 54 | 55 | """ 56 | 57 | from typing import List, Any, Tuple 58 | import urllib.parse 59 | import re 60 | import json 61 | import logging 62 | import os 63 | from selenium.webdriver.common.by import By 64 | from selenium.webdriver.support.ui import Select 65 | from selenium.webdriver.common.keys import Keys 66 | from selenium.common.exceptions import NoSuchElementException 67 | import jsonschema 68 | from atomicwrites import atomic_write 69 | from . import scrape_lib 70 | from . import google_login 71 | from . import google_takeout 72 | 73 | logger = logging.getLogger('google_purchases') 74 | 75 | netloc_re = r'^([^\.@]+\.)*google.com$' 76 | 77 | 78 | def check_url(url): 79 | result = urllib.parse.urlparse(url) 80 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 81 | raise RuntimeError('Reached invalid URL: %r' % url) 82 | 83 | class Scraper(google_takeout.Scraper): 84 | def __init__(self, output_directory: str, **kwargs): 85 | super().__init__(**kwargs) 86 | self.output_directory = output_directory 87 | 88 | def check_after_wait(self): 89 | check_url(self.driver.current_url) 90 | 91 | def extract_raw_data(self): 92 | source = self.driver.page_source 93 | prefix = 'data:function(){return ' 94 | start_index = source.index(prefix) + len(prefix) 95 | source_suffix = source[start_index:] 96 | try: 97 | value = json.loads(source_suffix) 98 | raise ValueError('Expected error parsing JSON') 99 | except json.JSONDecodeError as e: 100 | encoded_json = source_suffix[:e.pos] 101 | value = json.loads(encoded_json) 102 | return value 103 | 104 | def _fetch_html_pages(self, need_to_fetch: List[Tuple[str, str]]): 105 | logger.info('Fetching details for %d purchases', len(need_to_fetch)) 106 | for i, (purchase_id, html_path) in enumerate(need_to_fetch): 107 | url = 'https://myaccount.google.com/purchases/detail?order_id=' + purchase_id 108 | logger.info('Fetching details %d/%d: %s', i, len(need_to_fetch), url) 109 | with self.wait_for_page_load(): 110 | self.driver.get(url) 111 | content = self.driver.page_source 112 | with atomic_write( 113 | html_path, mode='w', encoding='utf-8', newline='\n') as f: 114 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 115 | f.write('\ufeff' + content) 116 | logger.info('Write details %d/%d: %s', i, len(need_to_fetch), html_path) 117 | 118 | def run(self): 119 | if not os.path.exists(self.output_directory): 120 | os.makedirs(self.output_directory) 121 | 122 | self.download_data() 123 | 124 | def download_data(self): 125 | takeout_zip = self.get_takeout_zipfile(['my_orders']) 126 | need_to_fetch = [] 127 | for name in takeout_zip.namelist(): 128 | m = re.match(r'.*/order_([0-9]+)\.json$', name) 129 | if m is None: 130 | logger.info('Ignoring file in takeout archive: %s', name) 131 | continue 132 | order_id = m.group(1) 133 | json_path = os.path.join(self.output_directory, 134 | 'order_' + order_id + '.json') 135 | if not os.path.exists(json_path): 136 | with atomic_write(json_path, mode='wb') 
as f: 137 | f.write(takeout_zip.read(name)) 138 | html_path = os.path.join(self.output_directory, order_id + '.html') 139 | if os.path.exists(html_path): 140 | continue 141 | need_to_fetch.append((order_id, html_path)) 142 | self._fetch_html_pages(need_to_fetch) 143 | 144 | 145 | def run(**kwargs): 146 | scrape_lib.run_with_scraper(Scraper, **kwargs) 147 | 148 | 149 | def interactive(**kwargs): 150 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 151 | -------------------------------------------------------------------------------- /example_finance_dl_config.py: -------------------------------------------------------------------------------- 1 | """Example configuration file for finance_dl. 2 | 3 | Configuration entries are defined by defining a top-level function with a name 4 | beginning with `CONFIG_`. The portion after the `CONFIG_` prefix is the name 5 | of the configuration. 6 | 7 | Rather than hard code your usernames and passwords into this configuration 8 | file, you may instead wish to write some code to retrieve them from some 9 | external password store. 10 | """ 11 | 12 | import os 13 | 14 | # Directory for persistent browser profiles. 15 | profile_dir = os.path.join(os.getenv('HOME'), '.cache', 'finance_dl') 16 | data_dir = '/path/where/data/will/be/saved' 17 | 18 | 19 | def CONFIG_vanguard(): 20 | # To determine the correct values for `id`, `org`, and `url` for your 21 | # financial institution, search on https://www.ofxhome.com/ 22 | ofx_params = { 23 | 'id': '15103', 24 | 'org': 'Vanguard', 25 | 'url': 'https://vesnc.vanguard.com/us/OfxDirectConnectServlet', 26 | 'username': 'XXXXXX', 27 | 'password': 'XXXXXX', 28 | } 29 | return dict( 30 | module='finance_dl.ofx', 31 | ofx_params=ofx_params, 32 | output_directory=os.path.join(data_dir, 'vanguard'), 33 | ) 34 | 35 | 36 | def CONFIG_amazon(): 37 | return dict( 38 | module='finance_dl.amazon', 39 | credentials={ 40 | 'username': 'XXXXXX', 41 | 'password': 'XXXXXX', 42 | }, 43 | output_directory=os.path.join(data_dir, 'amazon'), 44 | # profile_dir is optional. 45 | profile_dir=os.path.join(profile_dir, 'amazon'), 46 | ) 47 | 48 | 49 | def CONFIG_mint(): 50 | return dict( 51 | module='finance_dl.mint', 52 | credentials={ 53 | 'username': 'XXXXXX', 54 | 'password': 'XXXXXX', 55 | }, 56 | output_directory=os.path.join(data_dir, 'mint'), 57 | # profile_dir is optional, but highly recommended to avoid having to 58 | # enter multi-factor authentication code each time. 59 | profile_dir=os.path.join(profile_dir, 'mint'), 60 | ) 61 | 62 | 63 | def CONFIG_healthequity(): 64 | return dict( 65 | module='finance_dl.healthequity', 66 | credentials={ 67 | 'username': 'XXXXXX', 68 | 'password': 'XXXXXX', 69 | }, 70 | # Use your HealthEquity account number as the last directory component. 71 | output_directory=os.path.join(data_dir, 'healthequity', '1234567'), 72 | 73 | # profile_dir is optional but highly recommended to avoid having to 74 | # enter multi-factor authentication code each time. 75 | profile_dir=os.path.join(profile_dir, 'healthequity'), 76 | ) 77 | 78 | 79 | def CONFIG_venmo(): 80 | return dict( 81 | module='finance_dl.venmo', 82 | credentials={ 83 | 'username': 'XXXXXX', 84 | 'password': 'XXXXXX', 85 | }, 86 | output_directory=os.path.join(data_dir, 'venmo'), 87 | 88 | # profile_dir is optional but highly recommended to avoid having to 89 | # enter multi-factor authentication code each time. 
90 | profile_dir=os.path.join(profile_dir, 'venmo'), 91 | ) 92 | 93 | 94 | def CONFIG_paypal(): 95 | return dict( 96 | module='finance_dl.paypal', 97 | credentials={ 98 | 'username': 'XXXXXX', 99 | 'password': 'XXXXXX', 100 | }, 101 | output_directory=os.path.join(data_dir, 'paypal'), 102 | ) 103 | 104 | 105 | def CONFIG_google_purchases(): 106 | return dict( 107 | module='finance_dl.google_purchases', 108 | credentials={ 109 | 'username': 'XXXXXX', 110 | 'password': 'XXXXXX', 111 | }, 112 | output_directory=os.path.join(data_dir, 'google_purchases'), 113 | ) 114 | 115 | 116 | def CONFIG_stockplanconnect(): 117 | return dict( 118 | module='finance_dl.stockplanconnect', 119 | credentials={ 120 | 'username': 'XXXXXX', 121 | 'password': 'XXXXXX', 122 | }, 123 | output_directory=os.path.join(data_dir, 'stockplanconnect'), 124 | headless=False, 125 | ) 126 | 127 | 128 | def CONFIG_pge(): 129 | return dict( 130 | module='finance_dl.pge', 131 | credentials={ 132 | 'username': 'XXXXXX', 133 | 'password': 'XXXXXX', 134 | }, 135 | output_directory=os.path.join(data_dir, 'pge'), 136 | ) 137 | 138 | 139 | def CONFIG_comcast(): 140 | return dict( 141 | module='finance_dl.comcast', 142 | credentials={ 143 | 'username': 'XXXXXX', 144 | 'password': 'XXXXXX', 145 | }, 146 | output_directory=os.path.join(data_dir, 'comcast'), 147 | ) 148 | 149 | 150 | def CONFIG_ebmud(): 151 | return dict( 152 | module='finance_dl.ebmud', 153 | credentials={ 154 | 'username': 'XXXXXX', 155 | 'password': 'XXXXXX', 156 | }, 157 | output_directory=os.path.join(data_dir, 'ebmud'), 158 | ) 159 | 160 | 161 | def CONFIG_anthem(): 162 | return dict( 163 | module='finance_dl.anthem', 164 | login_url='https://anthem.com', 165 | output_directory=os.path.join(data_dir, 'anthem'), 166 | profile_dir=os.path.join(profile_dir, 'anthem'), 167 | headless=False, 168 | ) 169 | 170 | 171 | def CONFIG_waveapps(): 172 | return dict( 173 | module='finance_dl.waveapps', 174 | credentials=dict( 175 | token='XXXXXXXX', 176 | ), 177 | output_directory=os.path.join(data_dir, 'waveapps'), 178 | ) 179 | 180 | 181 | def CONFIG_google_payroll(): 182 | return dict( 183 | module='finance_dl.ultipro_google', 184 | credentials={ 185 | 'username': 'XXXXXX', 186 | 'password': 'XXXXXX', 187 | }, 188 | output_directory=os.path.join(data_dir, 'documents', 'Income', 189 | 'Google'), 190 | 191 | # profile_dir is optional but recommended. 192 | profile_dir=os.path.join(profile_dir, 'google_payroll'), 193 | 194 | # Recommended for greater reliability. 195 | headless=False, 196 | ) 197 | -------------------------------------------------------------------------------- /finance_dl/anthem.py: -------------------------------------------------------------------------------- 1 | """Retrieves Anthem BlueCross Explanation of Benefits (EOB) statements. 2 | 3 | Due to automation countermeasures implemented by Anthem, this module is only 4 | semi-automatic: the user must manually login and navigate to the claims page. 5 | 6 | This uses the `selenium` Python package in conjunction with `chromedriver` to 7 | scrape the Anthem website. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `login_url`: Required. Must be a `str` that specifies the initial URL at 15 | which to start. The user is responsible for manually logging in and 16 | navigating to the claims page. 17 | 18 | - `output_directory`: Required. 
Must be a `str` that specifies the path on the 19 | local filesystem where the output will be written. If the directory does not 20 | exist, it will be created. 21 | 22 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 23 | path to a persistent Chrome browser profile to use. This should be a path 24 | used solely for this single configuration; it should not refer to your normal 25 | browser profile. If not specified, a fresh temporary profile will be used 26 | each time. 27 | 28 | - `headless`: Must be set to `False`, since this scraper requires manual input. 29 | 30 | Example: 31 | ======== 32 | 33 | def CONFIG_anthem(): 34 | return dict( 35 | module='finance_dl.anthem', 36 | login_url='https://anthem.com', 37 | output_directory=os.path.join(data_dir, 'anthem'), 38 | 39 | # profile_dir is optional but recommended. 40 | profile_dir=os.path.join(profile_dir, 'anthem'), 41 | 42 | # headless must be `False` since manual intervention is required 43 | headless=False, 44 | ) 45 | 46 | Output format: 47 | ============== 48 | 49 | For each claim, two files are written to the specified `output_directory`: 50 | `.json` contains a JSON representation of the claim as returned by the 51 | Anthem server, and `.pdf` contains the PDF "Explanation of Benefits" 52 | statement for the claim. 53 | 54 | The JSON file contains output of the form: 55 | 56 | { 57 | "patient": { 58 | "displayName": "John Smith", 59 | "uniqueId": "123456789", 60 | "allowsAccess": true 61 | }, 62 | "provider": "SOME MEDICAL PROVIDER", 63 | "totalCharges": 385, 64 | "serviceDate": "01/02/2017 00:00:00", 65 | "memberResponsibility": 111.05, 66 | "status": "Approved", 67 | "appliedToDeductible": 111.05, 68 | "claimNumber": "2017123AB1234" 69 | } 70 | 71 | 72 | 73 | Interactive shell: 74 | ================== 75 | 76 | From the interactive shell, type: `self.run()` to start the scraper. 77 | 78 | """ 79 | 80 | from typing import List, Any 81 | import urllib.parse 82 | import re 83 | import collections 84 | import json 85 | import logging 86 | import datetime 87 | import os 88 | from selenium.webdriver.common.by import By 89 | from selenium.webdriver.support.ui import Select 90 | from selenium.webdriver.common.keys import Keys 91 | from selenium.common.exceptions import NoSuchElementException 92 | import bs4 93 | import jsonschema 94 | from atomicwrites import atomic_write 95 | 96 | from . import scrape_lib 97 | from . 
import google_login 98 | 99 | logger = logging.getLogger('anthem') 100 | 101 | netloc_re = r'^([^\.@]+\.)*anthem.com$' 102 | 103 | 104 | class Scraper(scrape_lib.Scraper): 105 | def __init__(self, login_url: str, output_directory: str, **kwargs): 106 | super().__init__(use_seleniumrequests=True, **kwargs) 107 | self.login_url = login_url 108 | self.output_directory = output_directory 109 | 110 | def login(self): 111 | self.driver.get(self.login_url) 112 | 113 | def maybe_get_claims_json(self): 114 | try: 115 | soup = bs4.BeautifulSoup(self.driver.page_source, 'html.parser') 116 | return json.loads( 117 | soup.find(id='claimsJson').text, 118 | object_pairs_hook=collections.OrderedDict) 119 | except: 120 | raise NoSuchElementException 121 | 122 | def wait_for_claims_json(self): 123 | logger.info('Please login and navigate to the claims page') 124 | result = self.wait_and_return(self.maybe_get_claims_json, 125 | timeout=500)[0] 126 | logger.info('Claims data found') 127 | return result 128 | 129 | def save_documents(self): 130 | if not os.path.exists(self.output_directory): 131 | os.makedirs(self.output_directory) 132 | claims_json = self.wait_for_claims_json() 133 | downloads_needed = [] 134 | for claim in claims_json['claims']: 135 | url = claim['eobLinkUrl'] 136 | pdf_path = os.path.join(self.output_directory, 137 | claim['claimNumber'] + '.pdf') 138 | json_path = os.path.join(self.output_directory, 139 | claim['claimNumber'] + '.json') 140 | if not os.path.exists(json_path): 141 | with atomic_write( 142 | json_path, mode='w', encoding='utf-8', 143 | newline='\n') as f: 144 | f.write(json.dumps(claim, indent=' ').strip() + '\n') 145 | if not os.path.exists(pdf_path): 146 | if not claim['eobLinkUrl'].startswith('https:/'): continue 147 | downloads_needed.append((claim['eobLinkUrl'], pdf_path)) 148 | for i, (url, pdf_path) in enumerate(downloads_needed): 149 | logger.info('Downloading EOB %d/%d', i + 1, len(downloads_needed)) 150 | self.driver.get(url) 151 | download_result, = self.wait_and_return(self.get_downloaded_file) 152 | with atomic_write(pdf_path, mode='wb') as f: 153 | f.write(download_result[1]) 154 | 155 | def run(self): 156 | self.login() 157 | self.save_documents() 158 | 159 | 160 | def run(**kwargs): 161 | scrape_lib.run_with_scraper(Scraper, **kwargs) 162 | 163 | 164 | def interactive(**kwargs): 165 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 166 | -------------------------------------------------------------------------------- /finance_dl/pge.py: -------------------------------------------------------------------------------- 1 | """Retrieves Pacific Gas and Electric (PG&E) PDF bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. 
This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Statement Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_pge(): 39 | return dict( 40 | module='finance_dl.pge', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'pge'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import datetime 58 | import logging 59 | import os 60 | import urllib.parse 61 | 62 | from selenium.webdriver.common.by import By 63 | from selenium.webdriver.support.ui import Select 64 | from selenium.webdriver.common.keys import Keys 65 | 66 | from . import scrape_lib 67 | 68 | logger = logging.getLogger('pge_scrape') 69 | 70 | netloc_re = r'^([^\.@]+\.)*pge.com$' 71 | 72 | 73 | def check_url(url): 74 | result = urllib.parse.urlparse(url) 75 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 76 | raise RuntimeError('Reached invalid URL: %r' % url) 77 | 78 | 79 | def find_first_matching_date(lines, date_format): 80 | for line in lines: 81 | try: 82 | return datetime.datetime.strptime(line, date_format).date() 83 | except: 84 | pass 85 | return None 86 | 87 | 88 | class Scraper(scrape_lib.Scraper): 89 | def __init__(self, credentials, output_directory, **kwargs): 90 | super().__init__(**kwargs) 91 | self.credentials = credentials 92 | self.output_directory = output_directory 93 | self.logged_in = False 94 | 95 | def check_after_wait(self): 96 | check_url(self.driver.current_url) 97 | 98 | def login(self): 99 | if self.logged_in: 100 | return 101 | logger.info('Initiating log in') 102 | self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page') 103 | 104 | (username, password), = self.wait_and_return( 105 | self.find_username_and_password_in_any_frame) 106 | logger.info('Entering username and password') 107 | username.send_keys(self.credentials['username']) 108 | password.send_keys(self.credentials['password']) 109 | with self.wait_for_page_load(): 110 | password.send_keys(Keys.ENTER) 111 | logger.info('Logged in') 112 | self.logged_in = True 113 | 114 | def get_output_path(self, output_dir, date): 115 | journal_date_format = '%Y-%m-%d' 116 | return os.path.join( 117 | output_dir, '%s.bill.pdf' % (date.strftime(journal_date_format))) 118 | 119 | def process_download(self, download_result, output_dir): 120 | logger.info('Got download: %s' % download_result[0]) 121 | m = re.fullmatch(r'.*custbill([0-9]{2})([0-9]{2})([0-9]{4})\.pdf', 122 | download_result[0]) 123 | if not m: 124 | logger.error('Failed to determine date from downloaded file: %s' % 125 | download_result[0]) 126 | return True 127 | else: 128 | date = datetime.date( 129 | year=int(m.group(3)), month=int(m.group(1)), day=int( 130 | m.group(2))) 131 | new_path = self.get_output_path(output_dir, date) 132 | if os.path.exists(new_path): 133 | logger.info('Skipping duplicate download: %s', date) 134 | return False 135 | tmp_path = new_path + '.tmp' 136 | with open(tmp_path, 'wb') as f: 137 | download_data = download_result[1] 138 | f.write(download_data) 139 | 
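            # Write to a temporary file and rename it into place so that a partially-written bill is never left at the final path.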
os.rename(tmp_path, new_path) 140 | logger.info("Wrote %s", new_path) 141 | return True 142 | 143 | def get_bills(self, output_dir): 144 | logger.info('Looking for download link') 145 | (bills_link, ), = self.wait_and_return( 146 | lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2')) 147 | scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2) 148 | links, = self.wait_and_return( 149 | lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF") 150 | ) 151 | 152 | def do_download(link): 153 | scrape_lib.retry(lambda: self.click(link), retry_delay=2) 154 | logger.info('Waiting for download') 155 | download_result, = self.wait_and_return(self.get_downloaded_file) 156 | return self.process_download(download_result, output_dir) 157 | 158 | for link in links: 159 | if not do_download(link): 160 | break 161 | 162 | def run(self): 163 | self.login() 164 | if not os.path.exists(self.output_directory): 165 | os.makedirs(self.output_directory) 166 | self.get_bills(self.output_directory) 167 | 168 | 169 | def run(**kwargs): 170 | scrape_lib.run_with_scraper(Scraper, **kwargs) 171 | 172 | 173 | def interactive(**kwargs): 174 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 175 | -------------------------------------------------------------------------------- /finance_dl/comcast.py: -------------------------------------------------------------------------------- 1 | """Retrieves Comcast PDF bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Bill Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_comcast(): 39 | return dict( 40 | module='finance_dl.comcast', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'comcast'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import datetime 58 | import time 59 | import logging 60 | import os 61 | import urllib.parse 62 | 63 | import dateutil.parser 64 | from selenium.webdriver.common.by import By 65 | from selenium.webdriver.support.ui import Select 66 | from selenium.webdriver.common.keys import Keys 67 | 68 | from . 
import scrape_lib 69 | 70 | logger = logging.getLogger('comcast_scrape') 71 | 72 | netloc_re = r'^([^\.@]+\.)*(comcast.com|xfinity.com|comcast.net)$' 73 | 74 | 75 | def check_url(url): 76 | result = urllib.parse.urlparse(url) 77 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 78 | raise RuntimeError('Reached invalid URL: %r' % url) 79 | 80 | 81 | def find_first_matching_date(lines, date_format): 82 | for line in lines: 83 | try: 84 | return datetime.datetime.strptime(line, date_format).date() 85 | except: 86 | pass 87 | return None 88 | 89 | 90 | class Scraper(scrape_lib.Scraper): 91 | def __init__(self, credentials, output_directory, **kwargs): 92 | super().__init__(**kwargs) 93 | self.credentials = credentials 94 | self.output_directory = output_directory 95 | self.logged_in = False 96 | 97 | def check_after_wait(self): 98 | check_url(self.driver.current_url) 99 | 100 | def login(self): 101 | if self.logged_in: 102 | return 103 | logger.info('Initiating log in') 104 | self.driver.get('https://customer.xfinity.com/Secure/MyAccount/') 105 | 106 | (username, password), = self.wait_and_return( 107 | self.find_username_and_password_in_any_frame) 108 | logger.info('Entering username and password') 109 | username.send_keys(self.credentials['username']) 110 | password.send_keys(self.credentials['password']) 111 | with self.wait_for_page_load(): 112 | password.send_keys(Keys.ENTER) 113 | logger.info('Logged in') 114 | self.logged_in = True 115 | 116 | def get_output_path(self, output_dir, date): 117 | journal_date_format = '%Y-%m-%d' 118 | return os.path.join( 119 | output_dir, '%s.bill.pdf' % (date.strftime(journal_date_format))) 120 | 121 | def process_download(self, download_result, output_dir, date): 122 | logger.info('Got download: %s' % download_result[0]) 123 | new_path = self.get_output_path(output_dir, date) 124 | if os.path.exists(new_path): 125 | logger.info('Skipping duplicate download: %s', new_path) 126 | return 127 | tmp_path = new_path + '.tmp' 128 | with open(tmp_path, 'wb') as f: 129 | f.write(download_result[1]) 130 | os.rename(tmp_path, new_path) 131 | logger.info("Wrote %s" % new_path) 132 | 133 | def get_bills(self, output_dir): 134 | logger.info('Looking for bills link') 135 | 136 | def get_bills_link(): 137 | (bills_link, ), = self.wait_and_return( 138 | lambda: self.find_visible_elements_by_descendant_partial_text('View Bill History', 'span')) 139 | return bills_link 140 | 141 | bills_link = get_bills_link() 142 | 143 | for partial_text in ['Check it out', 'Continue To My Account']: 144 | try: 145 | continue_link, = self.find_visible_elements_by_descendant_partial_text( 146 | partial_text, 'button') 147 | continue_link.click() 148 | time.sleep(3.0) # wait for overlay to go away 149 | except: 150 | pass 151 | bills_link = get_bills_link() 152 | 153 | self.driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE) 154 | bills_link.click() 155 | 156 | def get_links(): 157 | links, = self.wait_and_return( 158 | lambda: self.driver.find_elements(By.XPATH, '//a[starts-with(text(), "View PDF")]')) 159 | return links 160 | 161 | links = get_links() 162 | time.sleep(5.0) 163 | links = get_links() 164 | 165 | for link in links: 166 | if not link.is_displayed(): 167 | continue 168 | cur_el = link 169 | bill_date = None 170 | while True: 171 | parent = cur_el.find_element_by_xpath('..') 172 | if parent == cur_el: 173 | break 174 | try: 175 | bill_date = dateutil.parser.parse(parent.text, fuzzy=True) 176 | break 177 | except: 178 | cur_el = parent 179 
| continue 180 | if bill_date is None: 181 | print('skipping link due to no bill date') 182 | continue 183 | bill_date = bill_date + datetime.timedelta(days=1) 184 | new_path = self.get_output_path(output_dir, bill_date) 185 | if os.path.exists(new_path): 186 | logger.info( 187 | "Skipping already-downloaded bill for %s" % bill_date) 188 | else: 189 | logger.info('Attempting download of bill for %s' % bill_date) 190 | link.click() 191 | logger.info('Waiting for download') 192 | download_result, = self.wait_and_return( 193 | self.get_downloaded_file) 194 | self.process_download(download_result, output_dir, bill_date) 195 | 196 | def run(self): 197 | self.login() 198 | if not os.path.exists(self.output_directory): 199 | os.makedirs(self.output_directory) 200 | self.get_bills(self.output_directory) 201 | 202 | 203 | def run(**kwargs): 204 | scrape_lib.run_with_scraper(Scraper, **kwargs) 205 | 206 | 207 | def interactive(**kwargs): 208 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 209 | -------------------------------------------------------------------------------- /finance_dl/update.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import argparse 3 | import importlib 4 | import subprocess 5 | import concurrent.futures 6 | import sys 7 | import threading 8 | import os 9 | import time 10 | 11 | config_prefix = 'CONFIG_' 12 | 13 | 14 | def _format_duration(count) -> str: 15 | seconds_per_day = 24 * 60 * 60 16 | if count >= seconds_per_day: 17 | return '%d days' % (count // seconds_per_day) 18 | return '%d minutes' % (count // 60) 19 | 20 | 21 | class CommandBase: 22 | def __init__(self, args): 23 | self.args = args 24 | self.config_module = importlib.import_module(args.config_module) 25 | self.log_dir = args.log_dir 26 | 27 | def get_all_configs(self) -> List[str]: 28 | names = [] 29 | for key in vars(self.config_module): 30 | if key.startswith(config_prefix): 31 | names.append(key[len(config_prefix):]) 32 | return names 33 | 34 | def get_last_update_path(self, config_name: str) -> str: 35 | return os.path.join(self.log_dir, config_name + '.lastupdate') 36 | 37 | def get_log_path(self, config_name: str) -> str: 38 | return os.path.join(self.log_dir, config_name + '.txt') 39 | 40 | def get_last_update_time(self, config_name: str) -> Optional[float]: 41 | try: 42 | statinfo = os.stat(self.get_last_update_path(config_name)) 43 | return statinfo.st_mtime 44 | except OSError: 45 | return None 46 | 47 | 48 | class StatusCommand(CommandBase): 49 | def __init__(self, args): 50 | super().__init__(args) 51 | 52 | def __call__(self): 53 | cur_time = time.time() 54 | config_names = self.get_all_configs() 55 | max_name_len = max(len(x) for x in config_names) 56 | update_times = [(name, self.get_last_update_time(name)) 57 | for name in config_names] 58 | 59 | def get_time_sort_key(mtime: Optional[int]) -> float: 60 | if mtime is None: 61 | return float('-inf') 62 | return mtime 63 | 64 | update_times.sort(key=lambda x: get_time_sort_key(x[1])) 65 | for name, mtime in update_times: 66 | if mtime is not None: 67 | update_string = '%s (%s ago)' % (time.strftime( 68 | '%c', 69 | time.localtime(mtime)), _format_duration(cur_time - mtime)) 70 | else: 71 | update_string = 'NEVER' 72 | print('%*s: %s' % (max_name_len, name, update_string)) 73 | 74 | 75 | class Updater(CommandBase): 76 | def __init__(self, args): 77 | super().__init__(args) 78 | force = self.args.force 79 | cur_time = time.time() 80 | configs = 
self.args.config 81 | if self.args.all: 82 | configs = self.get_all_configs() 83 | configs_to_update = [] 84 | for config in configs: 85 | mtime = self.get_last_update_time(config) 86 | if not force and mtime is not None and ( 87 | cur_time - mtime) < 24 * 60 * 60: 88 | print('%s: SKIPPING (updated %s ago)' % 89 | (config, _format_duration(cur_time - mtime))) 90 | continue 91 | configs_to_update.append(config) 92 | self.configs_to_update = configs_to_update 93 | self._lock = threading.Lock() 94 | self.configs_completed = 0 95 | 96 | def print_message(self, config, start_time, message, completed=False): 97 | with self._lock: 98 | if completed: 99 | self.configs_completed += 1 100 | print('[%d/%d] %s [%.fs elapsed] %s' % 101 | (self.configs_completed, len(self.configs_to_update), config, 102 | time.time() - start_time, message.rstrip())) 103 | 104 | def run_config(self, config): 105 | start_time = time.time() 106 | self.print_message(config, start_time, 'starting') 107 | success = False 108 | termination_message = 'SUCCESS' 109 | try: 110 | with open( 111 | self.get_log_path(config), 'w', encoding='utf-8', 112 | newline='') as f: 113 | process = subprocess.Popen( 114 | [ 115 | sys.executable, '-m', 'finance_dl.cli', 116 | '--config-module', self.args.config_module, '-c', config 117 | ], 118 | stdout=subprocess.PIPE, 119 | stderr=subprocess.STDOUT, 120 | bufsize=1, 121 | universal_newlines=True, 122 | ) 123 | for line in process.stdout: 124 | self.print_message(config, start_time, line.rstrip()) 125 | f.write(line) 126 | process.wait() 127 | if process.returncode == 0: 128 | success = True 129 | with open( 130 | self.get_last_update_path(config), 131 | 'w', 132 | encoding='utf-8', 133 | newline='') as f: 134 | pass 135 | else: 136 | termination_message = 'FAILED with return code %d' % (process.returncode) 137 | 138 | except: 139 | termination_message = 'FAILED with exception' 140 | self.print_message(config, start_time, termination_message, 141 | completed=True) 142 | 143 | def __call__(self): 144 | with concurrent.futures.ThreadPoolExecutor( 145 | max_workers=self.args.parallelism) as executor: 146 | for config in self.configs_to_update: 147 | executor.submit(self.run_config, config) 148 | 149 | 150 | def main(): 151 | ap = argparse.ArgumentParser( 152 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 153 | ap.add_argument('--config-module', type=str, required=True, 154 | help='Python module defining CONFIG_ functions.') 155 | ap.add_argument('--log-dir', type=str, required=True, 156 | help='Directory containing log files.') 157 | 158 | subparsers = ap.add_subparsers(dest='command') 159 | subparsers.required = True 160 | 161 | ap_status = subparsers.add_parser( 162 | 'status', 163 | help='Show update status.', 164 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 165 | ) 166 | ap_status.set_defaults(command_class=StatusCommand) 167 | 168 | ap_update = subparsers.add_parser( 169 | 'update', 170 | help='Update configurations.', 171 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 172 | ) 173 | ap_update.add_argument('config', nargs='*', type=str, default=[], 174 | help='Configuration to update') 175 | ap_update.add_argument( 176 | '-f', '--force', action='store_true', 177 | help='Force update even if the configuration has already run recently.' 
178 | ) 179 | ap_update.add_argument('-a', '--all', action='store_true', 180 | help='Update all configurations.') 181 | ap_update.add_argument( 182 | '-p', '--parallelism', type=int, default=4, 183 | help='Maximum number of configurations to update in parallel.') 184 | ap_update.set_defaults(command_class=Updater) 185 | 186 | args = ap.parse_args() 187 | 188 | command = args.command_class(args) 189 | command() 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /finance_dl/stockplanconnect.py: -------------------------------------------------------------------------------- 1 | """Retrieves PDF documents from https://www.stockplanconnect.com. 2 | 3 | These PDF documents can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the documents will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | - `headless`: Must be set to `False` currently, as this scraper does not work 27 | properly when run with a headless browser. 28 | 29 | Output format: 30 | ============== 31 | 32 | Each document is saved to the `output_directory` with a name like: 33 | 34 | 2017-02-09.Restricted_Units.Trade_Confirmations.Confirmation.pdf 35 | 2017-08-30.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf 36 | 2017-12-31.Other.Tax_Documents.Form_1099.pdf 37 | 38 | If there are multiple documents of the same type on the same date, a number is 39 | appended, e.g.: 40 | 41 | 2018-05-31.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf 42 | 2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.2.pdf 43 | 2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.3.pdf 44 | 45 | If for some reason this data source does not work and you wish to manually 46 | download documents, make sure to use the same name numbering scheme: the first 47 | document listed with a given date, document type, and name should be given no 48 | numeric suffix, the second such document should be given a suffix of `.2`, the 49 | third `.3`, etc. 50 | 51 | Example: 52 | ======== 53 | 54 | def CONFIG_stockplanconnect(): 55 | return dict( 56 | module='finance_dl.stockplanconnect', 57 | credentials={ 58 | 'username': 'XXXXXX', 59 | 'password': 'XXXXXX', 60 | }, 61 | output_directory=os.path.join(data_dir, 'stockplanconnect'), 62 | headless=False, 63 | ) 64 | 65 | Interactive shell: 66 | ================== 67 | 68 | From the interactive shell, type: `self.run()` to start the scraper. 
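For reference, here is a minimal sketch of the naming scheme described in the
Output format section above. The helper name and its exact sanitization are
illustrative assumptions; the scraper computes the real filenames itself.

    import re

    def example_output_name(date_str, doc_type, category, name, index=1):
        # Spaces become underscores, other unusual characters are dropped, and
        # only the second and later documents receive a numeric suffix.
        def sanitize(x):
            return re.sub('[^a-zA-Z0-9-_.]', '', x.replace(' ', '_'))
        suffix = '' if index == 1 else '.%d' % index
        return '%s.%s.%s.%s%s.pdf' % (date_str, sanitize(doc_type),
                                      sanitize(category), sanitize(name), suffix)

    # example_output_name('2018-06-28', 'Restricted Units', 'Trade Confirmations',
    #                     'Release Confirmation', 2) ==
    # '2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.2.pdf'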
69 | 70 | """ 71 | 72 | import urllib.parse 73 | import re 74 | import collections 75 | import time 76 | import logging 77 | import os 78 | 79 | import dateutil.parser 80 | from selenium.webdriver.common.by import By 81 | from selenium.webdriver.support.ui import Select 82 | from selenium.webdriver.common.keys import Keys 83 | 84 | from finance_dl import scrape_lib 85 | 86 | logger = logging.getLogger('scraper') 87 | 88 | netloc_re = r'^([^\.@]+\.)*stockplanconnect.com|([^\.@]+\.)*morganstanley.com$' 89 | 90 | 91 | def check_url(url): 92 | result = urllib.parse.urlparse(url) 93 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 94 | raise RuntimeError('Reached invalid URL: %r' % url) 95 | 96 | 97 | class Scraper(scrape_lib.Scraper): 98 | def __init__(self, credentials, output_directory, **kwargs): 99 | super().__init__(**kwargs) 100 | self.credentials = credentials 101 | self.output_directory = output_directory 102 | 103 | def check_after_wait(self): 104 | check_url(self.driver.current_url) 105 | 106 | def login(self): 107 | logger.info('Initiating log in') 108 | self.driver.get('https://www.stockplanconnect.com') 109 | (username, password), = self.wait_and_return( 110 | self.find_username_and_password_in_any_frame) 111 | time.sleep(2.0) 112 | username.click() 113 | time.sleep(2.0) 114 | logger.info('Entering username') 115 | username.send_keys(self.credentials['username']) 116 | username.click() 117 | time.sleep(2.0) 118 | logger.info('Entering password') 119 | password.click() 120 | time.sleep(1.0) 121 | password.send_keys(self.credentials['password']) 122 | time.sleep(1.0) 123 | with self.wait_for_page_load(): 124 | password.send_keys(Keys.ENTER) 125 | logger.info('Logged in') 126 | 127 | def get_output_path(self, parts, index): 128 | journal_date_format = '%Y-%m-%d' 129 | date = dateutil.parser.parse(parts[0]) 130 | 131 | def sanitize(x): 132 | x = x.replace(' ', '_') 133 | x = re.sub('[^a-zA-Z0-9-_.]', '', x) 134 | return x 135 | 136 | suffix = '' 137 | if index != 1: 138 | suffix = '.%d' % index 139 | 140 | return os.path.join( 141 | self.output_directory, 142 | '%s.%s.%s.%s%s.pdf' % (date.strftime(journal_date_format), 143 | sanitize(parts[1]), sanitize(parts[2]), 144 | sanitize(parts[3]), suffix)) 145 | 146 | def get_documents(self): 147 | logger.info('Looking for documents link') 148 | documents, = self.wait_and_locate((By.PARTIAL_LINK_TEXT, 'Documents')) 149 | scrape_lib.retry(lambda: self.click(documents), num_tries=3, 150 | retry_delay=5) 151 | self.download_documents() 152 | 153 | def download_documents(self): 154 | logger.info('Looking for PDF links') 155 | links, = self.wait_and_return( 156 | lambda: self.driver.find_elements(By.LINK_TEXT, 'PDF')) 157 | links = list(links)[::-1] 158 | previously_seen_parts = collections.Counter() 159 | for link in links: 160 | cur_el = link 161 | output_path = None 162 | while True: 163 | try: 164 | parent = cur_el.find_element_by_xpath('..') 165 | except: 166 | break 167 | if parent == cur_el: 168 | break 169 | full_text = parent.text 170 | parts = full_text.split('\n') 171 | if len(parts) == 5: 172 | try: 173 | key = tuple(parts) 174 | index = previously_seen_parts[key] + 1 175 | previously_seen_parts[key] += 1 176 | output_path = self.get_output_path(parts, index) 177 | break 178 | except: 179 | logger.info('Failed to determine output filename %r', 180 | parts) 181 | break 182 | else: 183 | cur_el = parent 184 | if output_path is None: 185 | logger.info('skipping link due to no date') 186 | continue 187 | if 
os.path.exists(output_path): 188 | logger.info('skipping existing file: %r', output_path) 189 | continue 190 | 191 | self.click(link) 192 | logger.info('Waiting for download') 193 | download_result, = self.wait_and_return(self.get_downloaded_file) 194 | 195 | if not os.path.exists(self.output_directory): 196 | os.makedirs(self.output_directory) 197 | 198 | tmp_path = output_path + '.tmp' 199 | with open(tmp_path, 'wb') as f: 200 | download_data = download_result[1] 201 | f.write(download_data) 202 | os.rename(tmp_path, output_path) 203 | logger.info("Wrote %s", output_path) 204 | 205 | def run(self): 206 | self.login() 207 | self.get_documents() 208 | 209 | 210 | def run(**kwargs): 211 | scrape_lib.run_with_scraper(Scraper, **kwargs) 212 | 213 | 214 | def interactive(**kwargs): 215 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 216 | -------------------------------------------------------------------------------- /finance_dl/waveapps.py: -------------------------------------------------------------------------------- 1 | """Retrieves receipt images and extracted data from waveapps.com. 2 | 3 | This uses the waveapps API (https://docs.waveapps.io/) to retrieve the data 4 | directly. 5 | 6 | 7 | Configuration: 8 | ============== 9 | 10 | The following keys may be specified as part of the configuration dict: 11 | 12 | - `credentials`: Required. Must be a `dict` with a `'token'` key specifying a 13 | Full Access token. To generate a token, first sign in to https://waveapps.com 14 | and then visit the "Manage Applications" page: 15 | https://developer.waveapps.com/hc/en-us/articles/360019762711 16 | 17 | Choose "Create an application", then after creating an application choose 18 | "Create token". 19 | 20 | Alternatively, if you have a valid OAuth2 client id, instead of the `'token'` 21 | field you may specify `'client_id'`, `'username'`, and `'password'` fields. 22 | Signing in with a Google account is not supported. 23 | 24 | - `output_directory`: Required. Must be a `str` that specifies the path on the 25 | local filesystem where the output will be written. If the directory does not 26 | exist, it will be created. 27 | 28 | - `use_business_directory`: Optional. If specified, must be a `bool`. If `True`, 29 | create a subdirectory in `output_directory` to write the output for each 30 | business ID. 31 | 32 | - `active_only`: Optional. If specified, must be a `bool`. If `True`, do not 33 | download deleted receipts. 34 | 35 | Output format: 36 | ============== 37 | 38 | This module downloads receipts for all businesses that are accessible using the 39 | specified `credentials`. The receipts for each business is stored in the 40 | sub-directory of the specified `output_directory` with a name equal to the 41 | business name. If the sub-directory does not exist, it will be created. 42 | 43 | Within each business sub-directory, for each receipt, the JSON data as returned 44 | by the API is saved as `.json`. The JSON data contains at least the 45 | following fields: 46 | 47 | - `id`: The unique receipt identifier, matching the `` portion of 48 | the filename. 49 | 50 | - `date`: The date. 51 | 52 | - `merchant`: Merchant name 53 | 54 | - `note`: Optional note. 55 | 56 | - `total`: Total amount. 57 | 58 | - `currency_code`: The currency code. 59 | 60 | The corresponding receipt images are saved in full resolution as: 61 | `.jpeg`, and if there are additional images, as 62 | `.01.jpeg`, `.02.jpeg`, etc. 
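As a quick illustration of consuming this output, the sketch below walks the
per-business sub-directories and prints a summary line per receipt using the
JSON fields listed above. The helper name is an assumption and is not part of
this module.

    import json
    import os

    def list_receipts(output_directory):
        # One sub-directory per business, one .json file per receipt.
        for business in sorted(os.listdir(output_directory)):
            business_dir = os.path.join(output_directory, business)
            if not os.path.isdir(business_dir):
                continue
            for filename in sorted(os.listdir(business_dir)):
                if not filename.endswith('.json'):
                    continue
                with open(os.path.join(business_dir, filename),
                          encoding='utf-8') as f:
                    receipt = json.load(f)
                print(receipt['date'], receipt['merchant'], receipt['total'],
                      receipt['currency_code'])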
63 | 64 | Example: 65 | ======== 66 | 67 | def CONFIG_waveapps(): 68 | return dict( 69 | module='finance_dl.waveapps', 70 | credentials={ 71 | 'token': 'XXXXXX', 72 | }, 73 | output_directory=os.path.join(data_dir, 'waveapps'), 74 | ) 75 | 76 | """ 77 | 78 | from typing import List, Any 79 | import contextlib 80 | import logging 81 | import json 82 | import os 83 | 84 | import requests 85 | from atomicwrites import atomic_write 86 | 87 | logger = logging.getLogger('waveapps') 88 | 89 | 90 | class WaveScraper(object): 91 | def __init__(self, credentials: dict, output_directory: str, 92 | use_business_directory: bool = False, 93 | active_only: bool = False, headless=None): 94 | del headless 95 | self.credentials = credentials 96 | self.output_directory = output_directory 97 | self.use_business_directory = use_business_directory 98 | self.active_only = active_only 99 | 100 | def get_oauth2_token(self): 101 | if 'token' in self.credentials: 102 | logger.info('Using specified token') 103 | self._oauth_token = { 104 | 'token_type': 'Bearer', 105 | 'access_token': self.credentials['token'] 106 | } 107 | else: 108 | logger.info('Obtaining oauth2 token') 109 | oauth_url = 'https://api.waveapps.com/oauth2/token/' 110 | response = requests.post( 111 | oauth_url, files={ 112 | k: (None, v, None, {}) 113 | for k, v in [ 114 | ('client_id', self.credentials['client_id']), 115 | ('username', self.credentials['username']), 116 | ('grant_type', 'password'), 117 | ('password', self.credentials['password']), 118 | ] 119 | }) 120 | response.raise_for_status() 121 | self._oauth_token = response.json() 122 | self._authenticated_headers = { 123 | 'authorization': 124 | self._oauth_token['token_type'] + ' ' + 125 | self._oauth_token['access_token'], 126 | } 127 | 128 | def get_businesses(self): 129 | logger.info('Getting list of businesses') 130 | response = requests.get( 131 | 'https://api.waveapps.com/businesses/?include_personal=true', 132 | headers=dict(self._authenticated_headers, 133 | accept='application/json'), 134 | ) 135 | response.raise_for_status() 136 | result = response.json() 137 | logger.info('Got %d businesses', len(result)) 138 | return result 139 | 140 | def get_receipts(self, business_id: str): 141 | logger.info('Getting receipts for business %s', business_id) 142 | receipts = [] # type: List[Any] 143 | response = requests.get( 144 | 'https://api.waveapps.com/businesses/' + business_id + 145 | '/receipts/?active_only=' + 146 | (self.active_only and 'true' or 'false'), 147 | headers=dict(self._authenticated_headers, 148 | accept='application/json'), 149 | ) 150 | response.raise_for_status() 151 | result = response.json() 152 | cur_list = result['results'] 153 | logger.info('Received %d receipts', len(cur_list)) 154 | receipts.extend(cur_list) 155 | return receipts 156 | 157 | def save_receipts(self, receipts: List[Any], output_directory: str = None): 158 | if not output_directory: 159 | output_directory = self.output_directory 160 | if not os.path.exists(output_directory): 161 | os.makedirs(output_directory) 162 | for receipt in receipts: 163 | output_prefix = os.path.join(output_directory, 164 | str(receipt['id'])) 165 | json_path = output_prefix + '.json' 166 | for image_i, image in enumerate(receipt['images']): 167 | image_url = image['file'] 168 | if image_i == 0: 169 | image_path = '%s.jpg' % (output_prefix, ) 170 | else: 171 | image_path = '%s.%02d.jpg' % (output_prefix, image_i) 172 | if not os.path.exists(image_path): 173 | logger.info('Downloading receipt image %s', image_url) 174 | r 
= requests.get(image_url) 175 | r.raise_for_status() 176 | data = r.content 177 | with atomic_write(image_path, mode='wb') as f: 178 | f.write(data) 179 | with atomic_write( 180 | json_path, 181 | mode='w', 182 | overwrite=True, 183 | encoding='utf-8', 184 | newline='\n') as f: 185 | json.dump(receipt, f, sort_keys=True, indent=' ') 186 | 187 | def run(self): 188 | self.get_oauth2_token() 189 | output_directory = self.output_directory 190 | businesses = self.get_businesses() 191 | for business in businesses: 192 | business_id = business['id'] 193 | receipts = self.get_receipts(business_id) 194 | if receipts and self.use_business_directory: 195 | output_directory = os.path.join(self.output_directory, 196 | business_id) 197 | self.save_receipts(receipts, output_directory) 198 | 199 | 200 | def run(**kwargs): 201 | scraper = WaveScraper(**kwargs) 202 | scraper.run() 203 | 204 | 205 | @contextlib.contextmanager 206 | def interactive(**kwargs): 207 | scraper = WaveScraper(**kwargs) 208 | kwargs['scraper'] = scraper 209 | yield kwargs 210 | -------------------------------------------------------------------------------- /finance_dl/ultipro_google.py: -------------------------------------------------------------------------------- 1 | """Retrieves Google employee payroll statements from Ultipro in PDF format. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Ultipro website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the PDF pay statements will be written. If the 16 | directory does not exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. 23 | 24 | - `headless`: Optional. If specified, must be a `bool`. Defaults to `True`. 25 | Indicates whether to use a headless browser. Scraping appears to be more 26 | reliable when this is set to `True`. 27 | 28 | Output format: 29 | ============== 30 | 31 | Each pay statement is downloaded in PDF format and saved to the 32 | `output_directory` with a filename of `%Y-%m-%d.statement-.pdf`, where 33 | `` is the document number in the "Pay History" list. In some cases, due to 34 | a bug of some sort, the document number in the "Pay History" list may differ 35 | from the document number included in the actual document. Such discrepancies 36 | are handled by the `beancount_import.source.ultipro_google` module. 37 | 38 | Example: 39 | ======== 40 | 41 | def CONFIG_google_payroll(): 42 | return dict( 43 | module='finance_dl.ultipro_google', 44 | credentials={ 45 | 'username': 'XXXXXX', 46 | 'password': 'XXXXXX', 47 | }, 48 | output_directory=os.path.join(data_dir, 'documents', 'Income', 49 | 'Google'), 50 | 51 | # profile_dir is optional but recommended. 52 | profile_dir=os.path.join(profile_dir, 'google_payroll'), 53 | 54 | # Recommended for greater reliability. 
55 | headless=False, 56 | ) 57 | 58 | Interactive shell: 59 | ================== 60 | 61 | From the interactive shell, type: `self.run()` to start the scraper. 62 | 63 | """ 64 | 65 | import datetime 66 | import logging 67 | import os 68 | import re 69 | import urllib.parse 70 | from selenium.webdriver.common.by import By 71 | from selenium.webdriver.support.ui import Select 72 | from selenium.webdriver.common.keys import Keys 73 | from atomicwrites import atomic_write 74 | from . import scrape_lib, google_login 75 | 76 | logger = logging.getLogger('ultipro') 77 | 78 | output_date_format = '%Y-%m-%d' 79 | 80 | 81 | class Scraper(scrape_lib.Scraper): 82 | def __init__(self, 83 | credentials, 84 | output_directory, 85 | login_url='https://googlemypay.ultipro.com', 86 | netloc_re=r'^([^\.@]+\.)*(ultipro.com|google.com)$', 87 | **kwargs): 88 | super().__init__(**kwargs) 89 | self.credentials = credentials 90 | self.login_url = login_url 91 | self.netloc_re = netloc_re 92 | self.output_directory = output_directory 93 | 94 | def check_url(self, url): 95 | result = urllib.parse.urlparse(url) 96 | if result.scheme != 'https' or not re.fullmatch(self.netloc_re, 97 | result.netloc): 98 | raise RuntimeError('Reached invalid URL: %r' % url) 99 | 100 | def check_after_wait(self): 101 | self.check_url(self.driver.current_url) 102 | 103 | def login(self): 104 | google_login.login(self, self.login_url) 105 | 106 | def get_next_statement(self, 107 | existing_statements=set(), 108 | downloaded_statements=set()): 109 | pay_history, = self.wait_and_return( 110 | lambda: self.find_element_in_any_frame( 111 | By.PARTIAL_LINK_TEXT, "Pay History", only_displayed=True)) 112 | pay_history.click() 113 | 114 | def get_statement_table(): 115 | try: 116 | for table in self.find_elements_in_any_frame( 117 | By.TAG_NAME, 'table', only_displayed=True): 118 | headings = [ 119 | x.text.strip() 120 | for x in table.find_elements_by_xpath('thead/tr/th') 121 | ] 122 | if 'Pay Date' in headings and 'Document Number' in headings: 123 | return table 124 | except: 125 | import traceback 126 | traceback.print_exc() 127 | 128 | table, = self.wait_and_return(get_statement_table) 129 | date_format = '%m/%d/%Y' 130 | for row in table.find_elements_by_xpath('tbody/tr'): 131 | row_text = [ 132 | x.text.strip() for x in row.find_elements_by_tag_name('td') 133 | ] 134 | row_text = [x for x in row_text if x] 135 | pay_date = row_text[0] 136 | document_number = row_text[1] 137 | assert re.fullmatch('[0-9A-Z]+', document_number), document_number 138 | pay_date = datetime.datetime.strptime(pay_date, date_format).date() 139 | document_str = 'Document %r : %r' % (pay_date, document_number) 140 | if (pay_date, document_number) in existing_statements: 141 | logger.info(' Found in existing') 142 | continue 143 | if (pay_date, document_number) not in downloaded_statements: 144 | logger.info('%s: Downloading', document_str) 145 | link = row.find_element_by_tag_name('a') 146 | link.click() 147 | download_link, = self.wait_and_return( 148 | lambda: self.find_element_in_any_frame( 149 | By.XPATH, 150 | '//input[@type="image" and contains(@title, "download")]' 151 | )) 152 | download_link.click() 153 | logger.info('%s: Waiting to get download', document_str) 154 | download_result, = self.wait_and_return( 155 | self.get_downloaded_file) 156 | name, data = download_result 157 | if len(data) < 5000: 158 | raise RuntimeError( 159 | 'Downloaded file size is invalid: %d' % len(data)) 160 | output_name = '%s.statement-%s.pdf' % ( 161 | 
pay_date.strftime('%Y-%m-%d'), document_number) 162 | output_path = os.path.join(self.output_directory, output_name) 163 | with atomic_write(output_path, mode='wb') as f: 164 | f.write(data) 165 | downloaded_statements.add((pay_date, document_number)) 166 | return True 167 | else: 168 | logger.info('%s: Just downloaded', document_str) 169 | return False 170 | 171 | def get_existing_statements(self): 172 | existing_statements = set() 173 | if os.path.exists(self.output_directory): 174 | for name in os.listdir(self.output_directory): 175 | m = re.fullmatch( 176 | r'([0-9]{4})-([0-9]{2})-([0-9]{2})\.statement-([0-9A-Z]+)\.pdf', 177 | name) 178 | if m is not None: 179 | date = datetime.date( 180 | year=int(m.group(1)), 181 | month=int(m.group(2)), 182 | day=int(m.group(3))) 183 | statement_number = m.group(4) 184 | existing_statements.add((date, statement_number)) 185 | logger.info('Found existing statement %r %r', date, 186 | statement_number) 187 | else: 188 | logger.warning( 189 | 'Ignoring extraneous file in existing statement directory: %r', 190 | os.path.join(self.output_directory, name)) 191 | return existing_statements 192 | 193 | def download_statements(self): 194 | if not os.path.exists(self.output_directory): 195 | os.makedirs(self.output_directory) 196 | existing_statements = self.get_existing_statements() 197 | downloaded_statements = set() 198 | while self.get_next_statement( 199 | existing_statements=existing_statements, 200 | downloaded_statements=downloaded_statements, 201 | ): 202 | pass 203 | 204 | def run(self): 205 | self.login() 206 | self.download_statements() 207 | 208 | 209 | def run(**kwargs): 210 | scrape_lib.run_with_scraper(Scraper, **kwargs) 211 | 212 | 213 | def interactive(**kwargs): 214 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 215 | -------------------------------------------------------------------------------- /finance_dl/paypal.py: -------------------------------------------------------------------------------- 1 | """Retrieves Paypal activity from https://paypal.com. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Google purchases website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. 23 | 24 | Output format: 25 | ============== 26 | 27 | For each Paypal transaction, two files are written to the specified 28 | `output_directory`: `.json` contains a JSON representation of the 29 | transaction as returned by the Paypal server, and `.html` contains an HTML 30 | representation. 31 | 32 | For invoices, instead the files `.pdf` and `.invoice.json` are written 33 | to the specified `output_directory`. 34 | 35 | Interactive shell: 36 | ================== 37 | 38 | From the interactive shell, type: `self.run()` to start the scraper. 
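Example:
========

A configuration sketch following the same pattern as the other modules in this
package; the `data_dir` and `profile_dir` variables and the placeholder
credentials are assumptions to be replaced with your own values.

    def CONFIG_paypal():
        return dict(
            module='finance_dl.paypal',
            credentials={
                'username': 'XXXXXX',
                'password': 'XXXXXX',
            },
            output_directory=os.path.join(data_dir, 'paypal'),
            # profile_dir is optional.
            profile_dir=os.path.join(profile_dir, 'paypal'),
        )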
39 | 40 | """ 41 | 42 | from typing import List, Any 43 | import urllib.parse 44 | import re 45 | import json 46 | import logging 47 | import datetime 48 | import os 49 | from selenium.webdriver.common.by import By 50 | from selenium.webdriver.support.ui import Select 51 | from selenium.webdriver.common.keys import Keys 52 | from selenium.common.exceptions import NoSuchElementException 53 | import jsonschema 54 | from atomicwrites import atomic_write 55 | from . import scrape_lib 56 | from . import google_login 57 | 58 | logger = logging.getLogger('paypal') 59 | 60 | netloc_re = r'^([^\.@]+\.)*paypal.com$' 61 | 62 | transaction_list_schema = { 63 | '#schema': 'http://json-schema.org/draft-07/schema#', 64 | 'description': 'JSON schema for the transaction list response.', 65 | 'type': 'object', 66 | 'required': ['data'], 67 | 'properties': { 68 | 'data': { 69 | 'type': 'object', 70 | 'required': ['activity'], 71 | 'properties': { 72 | 'activity': { 73 | 'type': 'object', 74 | 'required': ['transactions'], 75 | 'properties': { 76 | 'transactions': { 77 | 'type': 'array', 78 | 'items': { 79 | 'type': 'object', 80 | 'required': ['id'], 81 | 'properties': { 82 | 'id': { 83 | 'type': 'string', 84 | 'pattern': r'^[A-Za-z0-9\-]+$', 85 | }, 86 | }, 87 | } 88 | }, 89 | }, 90 | }, 91 | }, 92 | }, 93 | }, 94 | } 95 | 96 | transaction_details_schema = { 97 | '#schema': 'http://json-schema.org/draft-07/schema#', 98 | 'description': 'JSON schema for the transaction details response.', 99 | 'type': 'object', 100 | 'required': ['data'], 101 | 'properties': { 102 | 'data': { 103 | 'type': 'object', 104 | 'required': ['details'], 105 | 'properties': { 106 | 'details': { 107 | 'type': 'object', 108 | }, 109 | }, 110 | }, 111 | }, 112 | } 113 | 114 | 115 | def check_url(url): 116 | result = urllib.parse.urlparse(url) 117 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 118 | raise RuntimeError('Reached invalid URL: %r' % url) 119 | 120 | 121 | class Scraper(scrape_lib.Scraper): 122 | def __init__(self, credentials: dict, output_directory: str, **kwargs): 123 | super().__init__(use_seleniumrequests=True, **kwargs) 124 | self.credentials = credentials 125 | self.output_directory = output_directory 126 | self.logged_in = False 127 | 128 | def check_after_wait(self): 129 | check_url(self.driver.current_url) 130 | 131 | def login(self): 132 | if self.logged_in: 133 | return 134 | 135 | self.driver.get('https://www.paypal.com/us/signin') 136 | logger.info('Finding username field') 137 | username, = self.wait_and_locate((By.XPATH, '//input[@type="email"]'), 138 | only_displayed=True) 139 | logger.info('Entering username') 140 | username.send_keys(self.credentials['username']) 141 | username.send_keys(Keys.ENTER) 142 | logger.info('Finding password field') 143 | password, = self.wait_and_locate( 144 | (By.XPATH, '//input[@type="password"]'), only_displayed=True) 145 | logger.info('Entering password') 146 | password.send_keys(self.credentials['password']) 147 | with self.wait_for_page_load(): 148 | password.send_keys(Keys.ENTER) 149 | logger.info('Logged in') 150 | self.logged_in = True 151 | self.csrf_token = None 152 | 153 | def make_json_request(self, url): 154 | return self.driver.request( 155 | 'GET', url, headers={ 156 | 'x-csrf-token': self.get_csrf_token(), 157 | 'accept': 'application/json, text/javascript, */*; q=0.01', 158 | 'x-requested-with': 'XMLHttpRequest' 159 | }) 160 | 161 | def get_csrf_token(self): 162 | if self.csrf_token is not None: return self.csrf_token 163 | 
logging.info('Getting CSRF token') 164 | self.driver.get('https://www.paypal.com/myaccount/transactions/') 165 | # Get CSRF token 166 | body_element, = self.wait_and_locate((By.XPATH, 167 | '//body[@data-token!=""]')) 168 | self.csrf_token = body_element.get_attribute('data-token') 169 | return self.csrf_token 170 | 171 | def get_transaction_list(self): 172 | end_date = datetime.datetime.now().date() + datetime.timedelta(days=2) 173 | start_date = end_date - datetime.timedelta(days=365 * 10) 174 | date_format = '%Y-%m-%d' 175 | logging.info('Getting transaction list') 176 | url = ( 177 | 'https://www.paypal.com/myaccount/transactions/filter?' 178 | 'transactionType=ALL&nextPageToken=&freeTextSearch=&isClearFreeTextSearch=false&' 179 | 'isClearFilterSelection=false&isClientSideFiltering=false&selectedCurrency=ALL&' 180 | 'startDate=%s&endDate=%s' % (start_date.strftime(date_format), 181 | end_date.strftime(date_format))) 182 | resp = self.make_json_request(url) 183 | resp.raise_for_status() 184 | j = resp.json() 185 | jsonschema.validate(j, transaction_list_schema) 186 | return j['data']['activity']['transactions'] 187 | 188 | def save_transactions(self): 189 | transaction_list = self.get_transaction_list() 190 | logging.info('Got %d transactions', len(transaction_list)) 191 | for transaction in transaction_list: 192 | transaction_id = transaction['id'] 193 | output_prefix = os.path.join(self.output_directory, transaction_id) 194 | if transaction_id.startswith('INV'): 195 | pdf_path = output_prefix + '.pdf' 196 | if not os.path.exists(pdf_path): 197 | invoice_url = ( 198 | 'https://www.paypal.com/invoice/payerView/detailsInternal/' 199 | + transaction_id + '?printPdfMode=true') 200 | logging.info('Retrieving PDF %s', invoice_url) 201 | r = self.driver.request('GET', invoice_url) 202 | r.raise_for_status() 203 | data = r.content 204 | with atomic_write(pdf_path, mode='wb') as f: 205 | f.write(data) 206 | invoice_json_path = output_prefix + '.invoice.json' 207 | if not os.path.exists(invoice_json_path): 208 | with atomic_write( 209 | invoice_json_path, 210 | mode='w', 211 | encoding='utf-8', 212 | newline='\n') as f: 213 | f.write(json.dumps(transaction, indent=' ')) 214 | continue 215 | details_url = ( 216 | 'https://www.paypal.com/myaccount/transactions/details/' + 217 | transaction_id) 218 | inline_details_url = ( 219 | 'https://www.paypal.com/myaccount/transactions/details/inline/' 220 | + transaction_id) 221 | html_path = output_prefix + '.html' 222 | json_path = output_prefix + '.json' 223 | if not os.path.exists(html_path): 224 | logging.info('Retrieving HTML %s', details_url) 225 | html_resp = self.driver.request('GET', details_url) 226 | html_resp.raise_for_status() 227 | with atomic_write( 228 | html_path, mode='w', encoding='utf-8', 229 | newline='\n') as f: 230 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 231 | f.write('\ufeff' + html_resp.text) 232 | if not os.path.exists(json_path): 233 | logging.info('Retrieving JSON %s', inline_details_url) 234 | json_resp = self.make_json_request(inline_details_url) 235 | json_resp.raise_for_status() 236 | j = json_resp.json() 237 | jsonschema.validate(j, transaction_details_schema) 238 | with atomic_write(json_path, mode='wb') as f: 239 | f.write( 240 | json.dumps(j['data']['details'], indent=' ').encode()) 241 | 242 | def run(self): 243 | if not os.path.exists(self.output_directory): 244 | os.makedirs(self.output_directory) 245 | self.login() 246 | self.save_transactions() 247 | 248 | 249 | 
def run(**kwargs): 250 | scrape_lib.run_with_scraper(Scraper, **kwargs) 251 | 252 | 253 | def interactive(**kwargs): 254 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 255 | -------------------------------------------------------------------------------- /finance_dl/amazon.py: -------------------------------------------------------------------------------- 1 | """Retrieves order invoices from Amazon. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Amazon website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `amazon_domain`: Optional. Specifies the Amazon domain from which to download 19 | orders. Must be one of `'.com'` or `'.co.cuk'`. Defaults to `'.com'`. 20 | 21 | - `regular`: Optional. Must be a `bool`. If `True` (the default), download regular orders. 22 | 23 | - `digital`: Optional. Must be a `bool` or `None`. If `True`, download digital 24 | orders. Defaults to `None`, which is equivalent to `True` for 25 | `amazon_domain=".com"`, and `False` for `amazon_domain=".co.uk"`. 26 | 27 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 28 | path to a persistent Chrome browser profile to use. This should be a path 29 | used solely for this single configuration; it should not refer to your normal 30 | browser profile. If not specified, a fresh temporary profile will be used 31 | each time. 32 | 33 | Output format: 34 | ============== 35 | 36 | Each regular or digital order invoice is written in HTML format to the specified 37 | `output_directory` using the naming scheme `.html`, 38 | e.g. `166-7926740-5141621.html` for a regular order invoice and 39 | `D56-5204779-4181560.html` for a digital order invoice. 40 | 41 | Example: 42 | ======== 43 | 44 | def CONFIG_amazon(): 45 | return dict( 46 | module='finance_dl.amazon', 47 | credentials={ 48 | 'username': 'XXXXXX', 49 | 'password': 'XXXXXX', 50 | }, 51 | output_directory=os.path.join(data_dir, 'amazon'), 52 | # profile_dir is optional. 53 | profile_dir=os.path.join(profile_dir, 'amazon'), 54 | ) 55 | 56 | Interactive shell: 57 | ================== 58 | 59 | From the interactive shell, type: `self.run()` to start the scraper. 60 | 61 | """ 62 | 63 | import urllib.parse 64 | import re 65 | import logging 66 | import os 67 | from selenium.webdriver.common.by import By 68 | from selenium.webdriver.support.ui import Select 69 | from selenium.webdriver.common.keys import Keys 70 | from atomicwrites import atomic_write 71 | from . 
import scrape_lib 72 | 73 | logger = logging.getLogger('amazon_scrape') 74 | 75 | 76 | class Domain: 77 | COM = 'com' 78 | CO_UK = 'co.uk' 79 | 80 | 81 | class Scraper(scrape_lib.Scraper): 82 | def __init__(self, credentials, output_directory, amazon_domain=Domain.COM, regular=True, digital=None, **kwargs): 83 | super().__init__(**kwargs) 84 | default_digital = True if amazon_domain == Domain.COM else False 85 | self.credentials = credentials 86 | self.output_directory = output_directory 87 | self.logged_in = False 88 | self.amazon_domain = amazon_domain 89 | self.regular = regular 90 | self.digital = digital if digital is not None else default_digital 91 | 92 | def check_url(self, url): 93 | netloc_re = r'^([^\.@]+\.)*amazon.' + self.amazon_domain + '$' 94 | result = urllib.parse.urlparse(url) 95 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 96 | raise RuntimeError('Reached invalid URL: %r' % url) 97 | 98 | def check_after_wait(self): 99 | self.check_url(self.driver.current_url) 100 | 101 | def login(self): 102 | logger.info('Initiating log in') 103 | self.driver.get('https://www.amazon.' + self.amazon_domain) 104 | if self.logged_in: 105 | return 106 | 107 | sign_out_links = self.find_elements_by_descendant_partial_text('Sign Out', 'a') 108 | if len(sign_out_links) > 0: 109 | logger.info('You must be already logged in!') 110 | self.logged_in = True 111 | return 112 | 113 | logger.info('Looking for sign-in link') 114 | sign_in_links, = self.wait_and_return( 115 | lambda: self.find_visible_elements_by_descendant_partial_text('Sign in', 'a') 116 | ) 117 | 118 | self.click(sign_in_links[0]) 119 | logger.info('Looking for username link') 120 | (username, ), = self.wait_and_return( 121 | lambda: self.find_visible_elements(By.XPATH, '//input[@type="email"]') 122 | ) 123 | username.send_keys(self.credentials['username']) 124 | 125 | logger.info('Looking for password link') 126 | (password, ), = self.wait_and_return( 127 | lambda: self.find_visible_elements(By.XPATH, '//input[@type="password"]') 128 | ) 129 | password.send_keys(self.credentials['password']) 130 | 131 | logger.info('Looking for "remember me" checkbox') 132 | (rememberMe, ) = self.wait_and_return( 133 | lambda: self.find_visible_elements(By.XPATH, '//input[@name="rememberMe"]')[0] 134 | ) 135 | rememberMe.click() 136 | 137 | password.send_keys(Keys.ENTER) 138 | 139 | logger.info('Logged in') 140 | self.logged_in = True 141 | 142 | def get_invoice_path(self, order_id): 143 | return os.path.join(self.output_directory, order_id + '.html') 144 | 145 | def get_orders(self, regular=True, digital=True): 146 | invoice_hrefs = [] 147 | order_ids_seen = set() 148 | 149 | def get_invoice_urls(): 150 | initial_iteration = True 151 | while True: 152 | 153 | def invoice_finder(): 154 | return self.driver.find_elements(By.XPATH, '//a[contains(@href, "orderID=")]') 155 | 156 | if initial_iteration: 157 | invoices = invoice_finder() 158 | else: 159 | invoices, = self.wait_and_return(invoice_finder) 160 | initial_iteration = False 161 | 162 | order_ids = set() 163 | for invoice_link in invoices: 164 | href = invoice_link.get_attribute('href') 165 | m = re.match('.*[&?]orderID=((?:D)?[0-9\\-]+)(?:&.*)?$', href) 166 | if m is None: 167 | raise RuntimeError( 168 | 'Failed to parse order ID from href %r' % (href, )) 169 | order_id = m[1] 170 | if order_id in order_ids: 171 | continue 172 | order_ids.add(order_id) 173 | invoice_path = self.get_invoice_path(order_id) 174 | if order_id in order_ids_seen: 175 | 
logger.info('Skipping already-seen order id: %r', 176 | order_id) 177 | continue 178 | if os.path.exists(invoice_path): 179 | logger.info('Skipping already-downloaded invoice: %r', 180 | order_id) 181 | continue 182 | print_url = 'https://www.amazon.%s/gp/css/summary/print.html?ie=UTF8&orderID=%s' % ( 183 | self.amazon_domain, order_id) 184 | invoice_hrefs.append((print_url, order_id)) 185 | order_ids_seen.add(order_id) 186 | 187 | # Find next link 188 | next_links = self.find_elements_by_descendant_text_match( 189 | '. = "Next"', 'a', only_displayed=True) 190 | if len(next_links) == 0: 191 | logger.info('Found no more pages') 192 | break 193 | if len(next_links) != 1: 194 | raise RuntimeError('More than one next link found') 195 | with self.wait_for_page_load(): 196 | self.click(next_links[0]) 197 | 198 | def retrieve_all_order_groups(): 199 | order_select_index = 0 200 | 201 | while True: 202 | (order_filter,), = self.wait_and_return( 203 | lambda: self.find_visible_elements(By.XPATH, '//select[@name="orderFilter"]') 204 | ) 205 | order_select = Select(order_filter) 206 | num_options = len(order_select.options) 207 | if order_select_index >= num_options: 208 | break 209 | option_text = order_select.options[ 210 | order_select_index].text.strip() 211 | if option_text != 'Archived Orders': 212 | logger.info('Retrieving order group: %r', option_text) 213 | with self.wait_for_page_load(): 214 | order_select.select_by_index(order_select_index) 215 | get_invoice_urls() 216 | 217 | order_select_index += 1 218 | if order_select_index >= num_options: 219 | break 220 | 221 | if regular: 222 | orders_text = "Your Orders" if self.amazon_domain == Domain.CO_UK else "Orders" 223 | # on co.uk, orders link is hidden behind the menu, hence not directly clickable 224 | (orders_link,), = self.wait_and_return( 225 | lambda: self.find_elements_by_descendant_text_match('. = "{}"'.format(orders_text), 'a', only_displayed=False) 226 | ) 227 | link = orders_link.get_attribute('href') 228 | scrape_lib.retry(lambda: self.driver.get(link), retry_delay=2) 229 | 230 | retrieve_all_order_groups() 231 | 232 | if digital: 233 | (digital_orders_link,), = self.wait_and_return( 234 | lambda: self.find_elements_by_descendant_text_match('contains(., "Digital Orders")', 'a', only_displayed=True) 235 | ) 236 | scrape_lib.retry(lambda: self.click(digital_orders_link), 237 | retry_delay=2) 238 | retrieve_all_order_groups() 239 | 240 | self.retrieve_invoices(invoice_hrefs) 241 | 242 | def retrieve_invoices(self, invoice_hrefs): 243 | for href, order_id in invoice_hrefs: 244 | invoice_path = self.get_invoice_path(order_id) 245 | 246 | logger.info('Downloading invoice for order %r', order_id) 247 | with self.wait_for_page_load(): 248 | self.driver.get(href) 249 | 250 | # For digital orders, Amazon dynamically generates some of the information. 251 | # Wait until it is all generated. 
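            # get_source below returns None until the 'Grand Total:' text is
            # present in the page source; self.wait_and_return re-evaluates it
            # until it yields a value.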
252 | def get_source(): 253 | source = self.driver.page_source 254 | if 'Grand Total:' in source: 255 | return source 256 | return None 257 | 258 | page_source, = self.wait_and_return(get_source) 259 | if order_id not in page_source: 260 | raise ValueError('Failed to retrieve information for order %r' 261 | % (order_id, )) 262 | with atomic_write( 263 | invoice_path, mode='w', encoding='utf-8', 264 | newline='\n') as f: 265 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 266 | f.write('\ufeff' + page_source) 267 | logger.info(' Wrote %s', invoice_path) 268 | 269 | def run(self): 270 | self.login() 271 | if not os.path.exists(self.output_directory): 272 | os.makedirs(self.output_directory) 273 | self.get_orders(regular=self.regular, digital=self.digital) 274 | 275 | 276 | def run(**kwargs): 277 | scrape_lib.run_with_scraper(Scraper, **kwargs) 278 | 279 | 280 | def interactive(**kwargs): 281 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 282 | -------------------------------------------------------------------------------- /finance_dl/venmo.py: -------------------------------------------------------------------------------- 1 | """Retrieves transaction and balance information from Venmo. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Venmo website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. It is highly recommended to specify a `profile_dir` to avoid 23 | having to manually enter a multi-factor authentication code each time. 24 | 25 | - `earliest_history_date`: Optional. If specified, must be a `datetime.date` 26 | specifying the earliest UTC date for which to retrieve data. 27 | 28 | - `max_history_days`: Optional. If `earliest_history_date` is not specified, 29 | this must be a positive `int` specifying the number of days of history to 30 | retrieve, starting from the previous UTC day. Defaults to `365*4`. If 31 | `earliest_history_date` is specified, `max_history_days` has no effect. 32 | 33 | Output format: 34 | ============== 35 | 36 | The retrieved transaction and balance information is merged into the 37 | `transactions.csv` and `balances.csv` files within the specified 38 | `output_directory`. Note that any existing transaction and balance information 39 | in those files is not overwritten; instead, new information is merged in without 40 | introducing duplicates. 
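Reading the merged files back is straightforward; for example (a sketch only,
using the `transactions.csv` column names shown below):

    import csv

    with open('transactions.csv', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            print(row['Datetime'], row['Type'], row['Amount (total)'])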
41 | 42 | The `transactions.csv` file is in the same CSV download format provided directly 43 | from the Venmo website, and has the format: 44 | 45 | " ID","Datetime","Type","Status","Note","From","To","Amount (total)","Amount (fee)","Funding Source","Destination" 46 | 47 | The `balances.csv` file is created from scraping the HTML and has the format: 48 | 49 | "Start Date","End Date","Start Balance","End Balance" 50 | 51 | Example: 52 | ======== 53 | 54 | def CONFIG_venmo(): 55 | return dict( 56 | module='finance_dl.venmo', 57 | credentials={ 58 | 'username': 'XXXXXX', 59 | 'password': 'XXXXXX', 60 | }, 61 | output_directory=os.path.join(data_dir, 'venmo'), 62 | 63 | # profile_dir is optional but highly recommended to avoid having to 64 | # enter multi-factor authentication code each time. 65 | profile_dir=os.path.join(profile_dir, 'venmo'), 66 | ) 67 | 68 | Interactive shell: 69 | ================== 70 | 71 | From the interactive shell, type: `self.run()` to start the scraper. 72 | 73 | """ 74 | 75 | import io 76 | import csv 77 | import urllib.parse 78 | import re 79 | import dateutil.parser 80 | import datetime 81 | import logging 82 | import os 83 | from selenium.webdriver.common.by import By 84 | from selenium.common.exceptions import NoSuchElementException 85 | from selenium.webdriver.support.ui import Select 86 | from selenium.webdriver.common.keys import Keys 87 | 88 | from . import scrape_lib 89 | from . import csv_merge 90 | 91 | logger = logging.getLogger('venmo_scrape') 92 | 93 | netloc_re = r'^([^\.@]+\.)*venmo.com$' 94 | 95 | 96 | def check_url(url): 97 | result = urllib.parse.urlparse(url) 98 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 99 | raise RuntimeError('Reached invalid URL: %r' % url) 100 | 101 | 102 | balance_field_names = [ 103 | 'Start Date', 'End Date', 'Start Balance', 'End Balance' 104 | ] 105 | 106 | standard_date_format = '%Y-%m-%d' 107 | 108 | 109 | def parse_csv_date(x): 110 | return dateutil.parser.parse( 111 | x, ignoretz=True).replace(tzinfo=datetime.timezone.utc) 112 | 113 | 114 | class Scraper(scrape_lib.Scraper): 115 | def __init__(self, credentials, output_directory, 116 | earliest_history_date=None, max_history_days=365 * 4, 117 | **kwargs): 118 | """ 119 | @param earliest_history_date: Earliest UTC date for which to retrieve 120 | transactions and balance information. 121 | 122 | @param max_history_days: Number of days of history to retrieve, starting 123 | from the previous UTC day, if `earliest_history_date` is not 124 | specified. 
125 | """ 126 | super().__init__(**kwargs) 127 | self.credentials = credentials 128 | self.output_directory = output_directory 129 | if not os.path.exists(self.output_directory): 130 | os.makedirs(self.output_directory) 131 | self.transactions_path = os.path.join(output_directory, 132 | 'transactions.csv') 133 | self.balances_path = os.path.join(output_directory, 'balances.csv') 134 | self.latest_history_date = ( 135 | datetime.datetime.now() - datetime.timedelta(days=1)).astimezone( 136 | datetime.timezone.utc).date() 137 | if earliest_history_date is None: 138 | self.earliest_history_date = self.latest_history_date - datetime.timedelta( 139 | days=max_history_days) 140 | else: 141 | self.earliest_history_date = dateutil.parser.parse( 142 | earliest_history_date).date() 143 | self.logged_in = False 144 | 145 | def check_after_wait(self): 146 | check_url(self.driver.current_url) 147 | 148 | def login(self): 149 | if self.logged_in: 150 | return 151 | logger.info('Initiating log in') 152 | self.driver.get('https://venmo.com/account/sign-in') 153 | 154 | (username, password), = self.wait_and_return( 155 | self.find_username_and_password_in_any_frame) 156 | logger.info('Entering username and password') 157 | username.send_keys(self.credentials['username']) 158 | password.send_keys(self.credentials['password']) 159 | with self.wait_for_page_load(): 160 | password.send_keys(Keys.ENTER) 161 | logger.info('Logged in') 162 | self.logged_in = True 163 | 164 | def goto_statement(self, start_date, end_date): 165 | url_date_format = '%m-%d-%Y' 166 | with self.wait_for_page_load(): 167 | self.driver.get( 168 | 'https://venmo.com/account/statement?end=%s&start=%s' % 169 | (end_date.strftime(url_date_format), 170 | start_date.strftime(url_date_format))) 171 | 172 | def download_csv(self): 173 | logger.info('Looking for CSV link') 174 | download_button, = self.wait_and_locate( 175 | (By.XPATH, '//button[text() = "Download CSV"]')) 176 | self.click(download_button) 177 | logger.info('Waiting for CSV download') 178 | download_result, = self.wait_and_return(self.get_downloaded_file) 179 | logger.info('Got CSV download') 180 | return download_result[1] 181 | 182 | def get_balance(self, balance_type): 183 | try: 184 | balance_node = self.driver.find_element( 185 | By.XPATH, '//*[@class="%s"]/child::*[@class="balance-amt"]' % 186 | balance_type) 187 | return balance_node.text 188 | except NoSuchElementException: 189 | return None 190 | 191 | def get_balances(self): 192 | def maybe_get_balance(): 193 | start_balance = self.get_balance('start-balance') 194 | end_balance = self.get_balance('end-balance') 195 | if start_balance is not None and end_balance is not None: 196 | return (start_balance, end_balance) 197 | try: 198 | error_node = self.driver.find_element( 199 | By.XPATH, '//*[@class="account-statement-error"]') 200 | error_text = error_node.text 201 | logging.info('Saw error text: %s', error_text) 202 | if error_text.startswith('Loading'): 203 | return None 204 | return ('unknown', 'unknown') 205 | except NoSuchElementException: 206 | return None 207 | 208 | result, = self.wait_and_return(maybe_get_balance) 209 | return result 210 | 211 | def write_csv(self, csv_result): 212 | csv_reader = csv.DictReader( 213 | io.StringIO(csv_result.decode(), newline='')) 214 | field_names = csv_reader.fieldnames 215 | rows = list(csv_reader) 216 | 217 | def get_sort_key(row): 218 | return parse_csv_date(row['Datetime']).timestamp() 219 | 220 | transactions_file = os.path.join(self.output_directory, 221 | 
                                          'transactions.csv')
222 |         csv_merge.merge_into_file(filename=transactions_file,
223 |                                   field_names=field_names, data=rows,
224 |                                   sort_by=get_sort_key)
225 | 
226 |     def get_existing_balances(self):
227 |         if not os.path.exists(self.balances_path):
228 |             return []
229 |         with open(self.balances_path, 'r', newline='', encoding='utf-8') as f:
230 |             csv_reader = csv.DictReader(f)
231 |             assert csv_reader.fieldnames == balance_field_names
232 |             return list(csv_reader)
233 | 
234 |     def get_start_date(self):
235 |         existing_balances = self.get_existing_balances()
236 |         if not existing_balances:
237 |             return self.earliest_history_date
238 |         return max(
239 |             datetime.datetime.strptime(row['End Date'], standard_date_format)
240 |             .date() for row in existing_balances) + datetime.timedelta(days=1)
241 | 
242 |     def fetch_statement(self, start_date, end_date):
243 |         logging.info('Fetching statement: [%s, %s]',
244 |                      start_date.strftime(standard_date_format),
245 |                      end_date.strftime(standard_date_format))
246 |         self.goto_statement(start_date, end_date)
247 |         start_balance, end_balance = self.get_balances()
248 |         # Write transactions before balance information, so that if an error occurs the statement will be retried next time
249 |         if (start_balance, end_balance) != ('unknown', 'unknown'):
250 |             csv_data = self.download_csv()
251 |             self.write_csv(csv_data)
252 |         else:
253 |             logging.info(
254 |                 'Skipping fetching transactions CSV because current period has no transactions'
255 |             )
256 |         csv_merge.merge_into_file(
257 |             filename=self.balances_path,
258 |             field_names=balance_field_names,
259 |             data=[{
260 |                 'Start Date': start_date.strftime(standard_date_format),
261 |                 'End Date': end_date.strftime(standard_date_format),
262 |                 'Start Balance': start_balance,
263 |                 'End Balance': end_balance,
264 |             }],
265 |             sort_by=lambda row: (row['Start Date'], row['End Date']),
266 |         )
267 | 
268 |     def fetch_history(self):
269 | 
270 |         start_date = self.get_start_date()
271 |         logging.info('Fetching history starting from %s',
272 |                      start_date.strftime('%Y-%m-%d'))
273 | 
274 |         while start_date <= self.latest_history_date:
275 |             end_date = min(self.latest_history_date,
276 |                            start_date + datetime.timedelta(days=89))
277 |             self.fetch_statement(start_date, end_date)
278 |             start_date = end_date + datetime.timedelta(days=1)
279 | 
280 |     def run(self):
281 |         self.login()
282 |         self.fetch_history()
283 | 
284 | 
285 | def run(**kwargs):
286 |     scrape_lib.run_with_scraper(Scraper, **kwargs)
287 | 
288 | 
289 | def interactive(**kwargs):
290 |     return scrape_lib.interact_with_scraper(Scraper, **kwargs)
291 | 
--------------------------------------------------------------------------------
/finance_dl/healthequity.py:
--------------------------------------------------------------------------------
1 | """Retrieves transaction and balance information from HealthEquity.
2 | 
3 | This uses the `selenium` Python package in conjunction with `chromedriver` to
4 | scrape the HealthEquity website.
5 | 
6 | Configuration:
7 | ==============
8 | 
9 | The following keys may be specified as part of the configuration dict:
10 | 
11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'`
12 |   keys.
13 | 
14 | - `output_directory`: Required. Must be a `str` that specifies the path on the
15 |   local filesystem where the output will be written. If the directory does not
16 |   exist, it will be created. For compatibility with `beancount-import`, the
17 |   last component of the `output_directory` should be your HealthEquity account
18 |   number.
19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. It is highly recommended to specify a `profile_dir` to avoid 25 | having to manually enter a multi-factor authentication code each time. 26 | 27 | Output format: 28 | ============== 29 | 30 | Cash transactions relating to contributions, distributions, and other are saved 31 | to `cash-transactions-contribution.csv`, `cash-transactions-distribution.csv`, 32 | and `cash-transactions-other.csv`, respectively, with the following fields: 33 | 34 | "Date","Transaction","Amount","Cash Balance" 35 | 36 | Investment transactions are saved to `investment-transactions.csv` with the 37 | following fields: 38 | 39 | "Date","Fund","Category","Description","Price","Amount","Shares","Total Shares","Total Value" 40 | 41 | Investment holdings are saved to files named like 42 | `YYYY-MM-ddTHHMMSSZZZZ.balances.csv`, where the date and time are the date and 43 | time at which the scraper was run. 44 | 45 | Example: 46 | ======== 47 | 48 | def CONFIG_healthequity(): 49 | return dict( 50 | module='finance_dl.healthequity', 51 | credentials={ 52 | 'username': 'XXXXXX', 53 | 'password': 'XXXXXX', 54 | }, 55 | # Use your HealthEquity account number as the last directory component. 56 | output_directory=os.path.join(data_dir, 'healthequity', '1234567'), 57 | 58 | # profile_dir is optional but highly recommended to avoid having to 59 | # enter multi-factor authentication code each time. 60 | profile_dir=os.path.join(profile_dir, 'healthequity'), 61 | ) 62 | 63 | Interactive shell: 64 | ================== 65 | 66 | From the interactive shell, type: `self.run()` to start the scraper. 67 | 68 | """ 69 | 70 | import urllib.parse 71 | import re 72 | import datetime 73 | import time 74 | import logging 75 | import os 76 | import bs4 77 | from selenium.webdriver.common.by import By 78 | from selenium.webdriver.support.ui import Select 79 | from selenium.webdriver.common.keys import Keys 80 | from . import scrape_lib 81 | from . 
import csv_merge 82 | 83 | logger = logging.getLogger('healthequity_scrape') 84 | 85 | netloc_re = r'^([^\.@]+\.)*healthequity.com$' 86 | 87 | 88 | def check_url(url): 89 | result = urllib.parse.urlparse(url) 90 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 91 | raise RuntimeError('Reached invalid URL: %r' % url) 92 | 93 | 94 | def find_first_matching_date(lines, date_format): 95 | for line in lines: 96 | try: 97 | return datetime.datetime.strptime(line, date_format).date() 98 | except: 99 | pass 100 | return None 101 | 102 | 103 | FUND_ACTIVITY_HEADERS = [ 104 | 'Fund', 'Name', 'Shares (#)', 'Closing Price', 'Closing Value' 105 | ] 106 | 107 | 108 | def write_balances(data, path): 109 | rows = [] 110 | for entry in data: 111 | keys = [x[0] for x in entry] 112 | if keys == FUND_ACTIVITY_HEADERS: 113 | row_values = dict(entry) 114 | row_values['Fund'] = row_values['Fund'].strip().split()[0] 115 | rows.append(row_values) 116 | csv_merge.write_csv(FUND_ACTIVITY_HEADERS, rows, path) 117 | 118 | 119 | def write_fund_activity(raw_transactions_data, path): 120 | input_date_format = '%m/%d/%Y' 121 | output_date_format = '%Y-%m-%d' 122 | soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml') 123 | headers = [ 124 | 'Date', 'Fund', 'Category', 'Description', 'Price', 'Amount', 'Shares', 125 | 'Total Shares', 'Total Value' 126 | ] 127 | rows = [] 128 | for row in soup.find_all('tr'): 129 | cells = [str(x.text).strip() for x in row.find_all('td')] 130 | while cells and not cells[-1].strip(): 131 | del cells[-1] 132 | if len(cells) == 1: 133 | continue 134 | assert len(cells) == len(headers) 135 | if cells == headers: 136 | continue 137 | row_values = dict(zip(headers, cells)) 138 | row_values['Date'] = datetime.datetime.strptime( 139 | row_values['Date'], input_date_format).strftime(output_date_format) 140 | rows.append(row_values) 141 | csv_merge.merge_into_file(filename=path, field_names=headers, data=rows, 142 | sort_by=lambda x: x['Date']) 143 | 144 | 145 | def write_transactions(raw_transactions_data, path): 146 | input_date_format = '%m/%d/%Y' 147 | output_date_format = '%Y-%m-%d' 148 | soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml') 149 | headers = ['Date', 'Transaction', 'Amount', 'HSA Cash Balance'] 150 | output_headers = ['Date', 'Transaction', 'Amount', 'Cash Balance'] 151 | rows = [] 152 | for row in soup.find_all('tr'): 153 | cells = [str(x.text).strip() for x in row.find_all('td')] 154 | while cells and not cells[-1].strip(): 155 | del cells[-1] 156 | if len(cells) <= 1: 157 | continue 158 | if cells[0] == 'TOTAL': 159 | continue 160 | assert len(cells) == len(headers) 161 | if cells == headers: 162 | continue 163 | row_values = dict(zip(headers, cells)) 164 | # Sanitize whitespace in description 165 | row_values['Transaction'] = ' '.join(row_values['Transaction'].split()) 166 | row_values['Cash Balance'] = row_values.pop('HSA Cash Balance') 167 | 168 | # Sanitize date_str 169 | date_str = row_values['Date'] 170 | date_str = re.sub('\\(Available .*\\)', '', date_str) 171 | 172 | row_values['Date'] = datetime.datetime.strptime( 173 | date_str, input_date_format).strftime(output_date_format) 174 | rows.append(row_values) 175 | rows.reverse() 176 | csv_merge.merge_into_file(filename=path, field_names=output_headers, 177 | data=rows, sort_by=lambda x: x['Date']) 178 | 179 | 180 | class Scraper(scrape_lib.Scraper): 181 | def __init__(self, credentials, output_directory, **kwargs): 182 | super().__init__(**kwargs) 183 | 
self.credentials = credentials 184 | self.output_directory = output_directory 185 | self.logged_in = False 186 | 187 | def check_after_wait(self): 188 | check_url(self.driver.current_url) 189 | 190 | def login(self): 191 | if self.logged_in: 192 | return 193 | logger.info('Initiating log in') 194 | self.driver.get('https://my.healthequity.com/') 195 | 196 | (username, password), = self.wait_and_return( 197 | self.find_username_and_password_in_any_frame) 198 | logger.info('Entering username and password') 199 | username.send_keys(self.credentials['username']) 200 | password.send_keys(self.credentials['password']) 201 | with self.wait_for_page_load(): 202 | password.send_keys(Keys.ENTER) 203 | logger.info('Logged in') 204 | self.logged_in = True 205 | 206 | def download_transaction_history(self): 207 | (transactions_link, ), = self.wait_and_return( 208 | lambda: self.find_visible_elements_by_descendant_partial_text('Transaction History', 'td')) 209 | scrape_lib.retry(transactions_link.click, retry_delay=2) 210 | (date_select, ), = self.wait_and_return( 211 | lambda: self.find_visible_elements_by_descendant_partial_text('All dates', 'select')) 212 | date_select = Select(date_select) 213 | with self.wait_for_page_load(): 214 | date_select.select_by_visible_text('All dates') 215 | 216 | results = {} 217 | for transaction_type in ['Contribution', 'Distribution', 'Other']: 218 | logger.info('Retrieving transaction history of type %s', 219 | transaction_type) 220 | (type_select, ), = self.wait_and_return( 221 | lambda: self.find_visible_elements_by_descendant_partial_text('All Transaction Types', 'select')) 222 | type_select = Select(type_select) 223 | with self.wait_for_page_load(): 224 | type_select.select_by_visible_text(transaction_type) 225 | 226 | (download_link,), = self.wait_and_return( 227 | lambda: self.find_visible_elements(By.XPATH, '//input[contains(@value,"Download")]')) 228 | scrape_lib.retry(download_link.click, retry_delay=2) 229 | # (excel_link,), = self.wait_and_return( 230 | # lambda: self.find_visible_elements(By.XPATH, '//input[contains(@name,"Excel")]')) 231 | # scrape_lib.retry(excel_link.click, retry_delay=2) 232 | logger.info('Waiting for downloaded transaction history') 233 | download_result, = self.wait_and_return(self.get_downloaded_file) 234 | results[transaction_type] = download_result[1] 235 | self.driver.back() # undo selection of transaction type 236 | self.driver.refresh() 237 | 238 | self.driver.back() # undo selection of "All dates" 239 | self.driver.back() # undo selection of "Transaction history" 240 | self.driver.refresh() 241 | 242 | return results 243 | 244 | def get_investment_balance(self): 245 | headers = FUND_ACTIVITY_HEADERS 246 | (table, ), = self.wait_and_return( 247 | lambda: scrape_lib.find_table_by_headers(self, headers)) 248 | data = scrape_lib.extract_table_data(table, headers) 249 | return data 250 | 251 | def go_to_investment_history(self): 252 | logger.info('Going to investment history') 253 | self.driver.get( 254 | 'https://www.healthequity.com/Member/Investment/Desktop.aspx') 255 | 256 | def download_fund_activity(self): 257 | logger.info('Looking for fund activity link') 258 | (fund_activity_link,), = self.wait_and_return( 259 | lambda: self.find_visible_elements(By.XPATH, '//a[contains(@href, "FundActivity")]')) 260 | scrape_lib.retry(fund_activity_link.click, retry_delay=2) 261 | logger.info('Selecting date ranage for fund activity') 262 | (start_date,), = self.wait_and_return( 263 | lambda: self.find_visible_elements(By.XPATH, 
'//input[@type="text" and contains(@id, "dateSelectStart")]')) 264 | start_date.clear() 265 | start_date.send_keys('01011900') 266 | logger.info('Downloading fund activity') 267 | (download_link, ), = self.wait_and_return( 268 | lambda: self.driver.find_elements_by_link_text('Download')) 269 | scrape_lib.retry(download_link.click, retry_delay=2) 270 | logger.info('Waiting for fund activity download') 271 | download_result, = self.wait_and_return(self.get_downloaded_file) 272 | return download_result[1] 273 | 274 | def download_data(self): 275 | raw_transactions = self.download_transaction_history() 276 | self.go_to_investment_history() 277 | raw_balances = self.get_investment_balance() 278 | raw_fund_activity = self.download_fund_activity() 279 | return raw_transactions, raw_balances, raw_fund_activity 280 | 281 | def run(self): 282 | self.login() 283 | if not os.path.exists(self.output_directory): 284 | os.makedirs(self.output_directory) 285 | raw_transactions, raw_balances, raw_fund_activity = self.download_data( 286 | ) 287 | write_balances( 288 | raw_balances, 289 | os.path.join( 290 | self.output_directory, 291 | '%s.balances.csv' % time.strftime('%Y-%m-%dT%H%M%S%z'))) 292 | for k, v in raw_transactions.items(): 293 | write_transactions( 294 | v, 295 | os.path.join(self.output_directory, 296 | 'cash-transactions-%s.csv' % (k.lower()))) 297 | write_fund_activity( 298 | raw_fund_activity, 299 | os.path.join(self.output_directory, 'investment-transactions.csv')) 300 | 301 | 302 | def run(**kwargs): 303 | scrape_lib.run_with_scraper(Scraper, **kwargs) 304 | 305 | 306 | def interactive(**kwargs): 307 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 308 | -------------------------------------------------------------------------------- /finance_dl/ofx.py: -------------------------------------------------------------------------------- 1 | """Retrieves OFX transaction and balance information using the OFX protocol. 2 | 3 | This module uses the `ofxclient` Python package to connect directly to financial 4 | institutions that support the OFX protocol. 5 | 6 | Refer to https://www.ofxhome.com/ to search for OFX connection information for 7 | your financial institution. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `ofx_params`: Required. Must be a `dict` with the following fields: 15 | - `id`: FI Id value (refer to https://www.ofxhome.com/) 16 | - `org`: FI Org value (refer to https://www.ofxhome.com/) 17 | - `url`: FI Url value (refer to https://www.ofxhome.com/) 18 | - `username`: Username for your account. 19 | - `password`: Password for your account. 20 | - `client_args`: Optional. `dict` of additional arguments to pass to the 21 | `ofxclient` library. Some banks, such as Chase, require that the OFX 22 | version be set to at least 103 and a unique client id be specified. This 23 | can be achieved using a `client_args` value of: 24 | 25 | dict( 26 | ofx_version='103', 27 | id='64f0e0bfe04f1a2d32cbddc8d30a3017', 28 | ) 29 | 30 | where `id` is a random hex string obtained from e.g.: 31 | `openssl rand -hex 16`. 32 | 33 | - `output_directory`: Required. Must be a `str` that specifies the path to the 34 | directory where OFX files are to be written. If it does not exist, it will be 35 | created. 36 | 37 | - `overlap_days`: Optional. An `int` that specifies the number of days of 38 | overlap to use when retrieving additional transactions. 
This is intended to
39 |   reduce the chances of transactions being missed (and duplicate transactions
40 |   can easily be filtered when processing the downloaded data). The default
41 |   value of `2` should be suitable in almost all cases.
42 | 
43 | - `min_start_date`: Optional. A `datetime.date` object specifying the earliest
44 |   date at which to attempt to retrieve data. If no existing files are present
45 |   for this account in the output directory, a binary search is done starting
46 |   from this date to determine the first date for which the server returns a
47 |   valid response. Otherwise, it is ignored. Defaults to `1990-01-01`, which
48 |   should be suitable in almost all cases.
49 | 
50 | - `min_days_retrieved`: Optional. An `int` specifying the minimum number of
51 |   days for which the server is expected to give data. It is assumed that if a
52 |   request is made starting no more than this many days from today, all
53 |   transactions will be received, and no additional request will be made. The
54 |   default value of `20` should be suitable in most cases, as most servers
55 |   support returning at least 30 days of transactions per request.
56 | 
57 | Output format:
58 | ==============
59 | 
60 | This module downloads OFX data for all accounts that are accessible using the
61 | specified `username`. The data for each account is stored in the sub-directory
62 | of the specified `output_directory` with a name equal to the account number. If
63 | the sub-directory does not exist, it will be created.
64 | 
65 | Within each account sub-directory, OFX files are saved using the file naming
66 | scheme:
67 | 
68 |     <start_date>-<end_date>--<download_time>.ofx
69 | 
70 | where <start_date> and <end_date> are in YYYYMMDD format and <download_time>
71 | is in seconds since epoch. The start and end dates reflect the DTSTART and
72 | DTEND fields in the OFX file.
73 | 
74 | Because some institutions only allow a limited number of days of data to be
75 | retrieved in a single request, this program automatically issues repeated
76 | requests in order to download all available data.
77 | 
78 | If no files have already been downloaded, a binary search is used to find the
79 | oldest point at which data is available.
80 | 
81 | Requests are issued repeatedly to fill any gaps in the range of downloaded data,
82 | and to extend the range towards the present date.
83 | 
84 | At least one request extending up to the present date is always issued in order
85 | to ensure up-to-date information is available.
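To make the naming scheme concrete, the date ranges already covered for one account can be recovered from the filenames alone. A minimal sketch (the `covered_ranges` helper is hypothetical and not part of this module):

    import datetime
    import os
    import re

    def covered_ranges(account_dir):
        # Parse "<start>-<end>--<time>.ofx" filenames into (start, end) date pairs.
        pattern = re.compile(r'^([0-9]{8})-([0-9]{8})--([0-9]+)\.ofx$')
        ranges = []
        for name in os.listdir(account_dir):
            match = pattern.match(name)
            if match is None:
                continue
            start = datetime.datetime.strptime(match.group(1), '%Y%m%d').date()
            end = datetime.datetime.strptime(match.group(2), '%Y%m%d').date()
            ranges.append((start, end))
        return sorted(ranges)

Gaps between consecutive ranges returned by such a helper are what the repeated requests described above are intended to fill.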
86 | 87 | Example: 88 | ======== 89 | 90 | def CONFIG_vanguard(): 91 | # To determine the correct values for `id`, `org`, and `url` for your 92 | # financial institution, search on https://www.ofxhome.com/ 93 | ofx_params = { 94 | 'id': '15103', 95 | 'org': 'Vanguard', 96 | 'url': 'https://vesnc.vanguard.com/us/OfxDirectConnectServlet', 97 | 'username': 'XXXXXX', 98 | 'password': 'XXXXXX', 99 | } 100 | return dict( 101 | module='finance_dl.ofx', 102 | ofx_params=ofx_params, 103 | output_directory=os.path.join(data_dir, 'vanguard'), 104 | ) 105 | 106 | """ 107 | 108 | import contextlib 109 | import warnings 110 | import datetime 111 | import os 112 | import time 113 | import re 114 | import logging 115 | import io 116 | 117 | from atomicwrites import atomic_write 118 | import bs4 119 | import dateutil.parser 120 | import ofxclient.institution 121 | import ofxclient 122 | 123 | from beancount.ingest.importers.ofx import parse_ofx_time, find_child 124 | 125 | warnings.filterwarnings('ignore', message='split()', module='re') 126 | 127 | logger = logging.getLogger('ofx') 128 | 129 | 130 | def sanitize_account_name(account_name: str): 131 | """Replaces any sequence of invalid characters in the account name with a dash. 132 | 133 | Returns the sanitized account name. 134 | """ 135 | if account_name == '.' or account_name == '..': 136 | raise ValueError('Invalid account name: %s' % account_name) 137 | 138 | return re.sub('[^a-z0-9A-Z.-]+', '-', account_name) 139 | 140 | 141 | def download_account_data_starting_from(account: ofxclient.account.Account, 142 | date: datetime.date): 143 | logger.info('Trying to retrieve data for %s starting at %s.', 144 | account.number, date) 145 | num_days = (datetime.date.today() - date).days 146 | return account.download(days=num_days).read().encode('ascii') 147 | 148 | 149 | def get_ofx_date_range(data: bytes): 150 | soup = bs4.BeautifulSoup(io.BytesIO(data), 'html.parser') 151 | dtstart = find_child(soup, 'dtstart', parse_ofx_time) 152 | dtend = find_child(soup, 'dtend', parse_ofx_time) 153 | if dtstart is None or dtend is None: 154 | logger.debug('Data received: %r', data) 155 | messages = soup.find_all('message') 156 | logger.info('Messages: %r', [message.text for message in messages]) 157 | return None 158 | return dtstart, dtend 159 | 160 | 161 | def get_earliest_data(account, start_date): 162 | """Try to retrieve earliest batch of account data, starting at `start_date'. 163 | 164 | Uses binary search to find the earliest point after start_date that yields a valid response. 165 | 166 | Returns ((startdate, enddate), data). 
167 | """ 168 | logger.info( 169 | 'Binary searching to find earliest data available for account %s.', 170 | account.number) 171 | lower_bound = start_date 172 | upper_bound = datetime.date.today() 173 | valid_data = None 174 | valid_date_range = None 175 | while lower_bound + datetime.timedelta(days=1) < upper_bound: 176 | mid = lower_bound + datetime.timedelta(days=(upper_bound - lower_bound 177 | ).days // 2) 178 | data = download_account_data_starting_from(account, mid) 179 | date_range = get_ofx_date_range(data) 180 | if date_range is not None: 181 | upper_bound = mid 182 | valid_data = data 183 | valid_date_range = date_range 184 | else: 185 | lower_bound = mid 186 | if not valid_data: 187 | raise RuntimeError('Failed to retrieve any data for account: %s' % 188 | account.number) 189 | return valid_date_range, valid_data 190 | 191 | 192 | def save_single_account_data( 193 | account: ofxclient.account.Account, output_dir: str, overlap_days=2, 194 | min_days_retrieved=20, 195 | min_start_date: datetime.date = dateutil.parser.parse( 196 | '1990-01-01').date(), 197 | always_save=True): 198 | """Attempts to download all transactions for the specified account. 199 | 200 | :param account: The connected account for which to download data. 201 | :param output_dir: Path to filesystem directory in which to store the 202 | downloaded OFX files. It will be (recursively) created if it does not 203 | exist. Saved files will be named 204 | "---.ofx", where and 205 | are in YYYYMMDD format and is in seconds 206 | since epoch. Date ranges corresponding to existing files with this 207 | naming pattern will not be re-downloaded. 208 | :param overlap_days: The number of days of overlap to use when retrieving 209 | additional transactions. This is intended to reduce the chances of 210 | transactions being missed (and duplicate transactions can easily be 211 | filtered when processing the downloaded data). The default value should 212 | be suitable in almost all cases. 213 | :param min_days_retrieved: The minimum number of days the server is expected 214 | to give data for. This function assumes that if a request is made 215 | starting no more than this many days from today, that all transactions 216 | will be received, and no additional request will be made. The default 217 | value should be suitable in most cases, as most servers support 218 | returning at least 30 days of transactions per request. 219 | :param min_start_date: If no existing files are present in `output_dir`, a 220 | binary search is done starting from this date to determine the first 221 | date for which the server returns a valid response. If this search turns 222 | up zero transactions, then nothing is saved for this account. 223 | :param always_save: When a new OFX file is downloaded that contains an 224 | end-date that matches a previously downloaded file's end-date, this flag 225 | determines if the new file should be saved or not. By not saving it, 226 | some transactions that occur later in the day could be missed (until 227 | additional transactions arrive on later days and they get included in 228 | the next download). By always saving the file, superfluous files could 229 | be created. 230 | """ 231 | 232 | if not os.path.exists(output_dir): 233 | os.makedirs(output_dir) 234 | date_format = '%Y%m%d' 235 | 236 | date_ranges = [] 237 | 238 | # Read all OFX files in output directory. 
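    # For example, a (hypothetical) file named "20190101-20190331--1554076800.ofx"
    # would contribute the already-covered range 2019-01-01..2019-03-31, so that
    # span is not requested again (apart from the `overlap_days` overlap).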
239 | for name in os.listdir(output_dir): 240 | match = re.match(r'^([0-9]{8})-([0-9]{8})--([0-9]+)\.ofx', name) 241 | if match is not None: 242 | start_date = datetime.datetime.strptime( 243 | match.group(1), date_format).date() 244 | end_date = datetime.datetime.strptime(match.group(2), 245 | date_format).date() 246 | if start_date > end_date: 247 | logger.warning('Invalid filename: %r', 248 | os.path.join(output_dir, name)) 249 | continue 250 | date_ranges.append((start_date, end_date)) 251 | date_ranges.sort() 252 | 253 | def save_data(date_range, data): 254 | t = time.time() 255 | logger.info('Received data %s -- %s', date_range[0], date_range[1]) 256 | filename = ('%s-%s--%d.ofx' % (date_range[0].strftime(date_format), 257 | date_range[1].strftime(date_format), t)) 258 | with atomic_write(os.path.join(output_dir, filename), mode='wb') as f: 259 | f.write(data) 260 | date_ranges.append((date_range[0].date(), date_range[1].date())) 261 | date_ranges.sort() 262 | 263 | if len(date_ranges) == 0: 264 | try: 265 | date_range, data = get_earliest_data(account, 266 | start_date=min_start_date) 267 | except RuntimeError as error: 268 | logger.warning(error) 269 | return 270 | 271 | save_data(date_range, data) 272 | 273 | def retrieve_more(): 274 | # Find next gap 275 | cur_range = None 276 | for i, cur_range in enumerate(date_ranges): 277 | if (i + 1 < len(date_ranges) and 278 | cur_range[1] > date_ranges[i + 1][0]): 279 | # If end date of current range is greater than start date of 280 | # next range, then there is no gap. 281 | continue 282 | break 283 | data = download_account_data_starting_from( 284 | account, cur_range[1] - datetime.timedelta(days=overlap_days)) 285 | date_range = get_ofx_date_range(data) 286 | if date_range is None: 287 | logger.warning('Failed to retrieve newer data for account %s', 288 | account.number) 289 | return False 290 | if (date_range[1].date() - cur_range[1]).days == 0: 291 | if always_save: 292 | save_data(date_range, data) 293 | return False 294 | save_data(date_range, data) 295 | return True 296 | 297 | while True: 298 | if not retrieve_more(): 299 | break 300 | if (datetime.date.today() - date_ranges[-1][0] 301 | ).days <= min_days_retrieved: 302 | break 303 | 304 | 305 | def save_all_account_data(inst: ofxclient.institution.Institution, 306 | output_dir: str, **kwargs): 307 | """Attempts to download data for all accounts. 308 | 309 | :param inst: The institution connection. 310 | :param output_dir: The base output directory in which to store the 311 | downloaded OFX files. The data for each account is saved in a 312 | subdirectory of `output_dir`, with a name equal to the account number. 313 | :param kwargs: Additional arguments to pass to save_single_account_data. 314 | """ 315 | accounts = inst.accounts() 316 | for a in accounts: 317 | try: 318 | name = sanitize_account_name(a.number) 319 | except ValueError: 320 | logger.warning('Account number is invalid path component: %r', 321 | name) 322 | continue 323 | save_single_account_data( 324 | account=a, output_dir=os.path.join(output_dir, name), **kwargs) 325 | 326 | 327 | def connect(params: dict) -> ofxclient.institution.Institution: 328 | """Connects to an OFX server. 329 | 330 | :param params: A dict containing the following string fields: 331 | 332 | - id: FI Id (see ofxhome.com) 333 | 334 | - org: FI Org (see ofxhome.com) 335 | 336 | - url: FI Url (see ofxhome.com) 337 | 338 | - broker_id: Optional. 
FI Broker Id (see ofxhome.com) 339 | 340 | - username: Your username 341 | 342 | - password: Your password 343 | 344 | :returns: A connected ofxclient.institution.Institution object. 345 | """ 346 | inst = ofxclient.institution.Institution(**params) 347 | inst.authenticate() 348 | return inst 349 | 350 | 351 | def run(ofx_params, output_directory, headless=False, **kwargs): 352 | """Download non-interactively.""" 353 | del headless 354 | inst = connect(ofx_params) 355 | save_all_account_data(inst, output_directory, **kwargs) 356 | 357 | 358 | @contextlib.contextmanager 359 | def interactive(ofx_params, output_directory, headless=False): 360 | """Returns variables for interactive session.""" 361 | del headless 362 | yield dict( 363 | ofx_params=ofx_params, 364 | output_directory=output_directory, 365 | inst=connect(ofx_params), 366 | ) 367 | -------------------------------------------------------------------------------- /finance_dl/scrape_lib.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import time 4 | import tempfile 5 | import shutil 6 | import seleniumrequests 7 | 8 | from selenium import webdriver 9 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 10 | from selenium.webdriver.support.ui import WebDriverWait, Select 11 | from selenium.webdriver.support import expected_conditions 12 | import signal 13 | 14 | from selenium.webdriver.remote.webdriver import WebDriver 15 | 16 | from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException 17 | from selenium.webdriver.common.by import By 18 | from selenium.webdriver.common.keys import Keys 19 | 20 | 21 | def all_conditions(*conditions): 22 | return lambda driver: all(condition(driver) for condition in conditions) 23 | 24 | 25 | def extract_table_data(table, header_names, single_header=False): 26 | rows = table.find_elements_by_xpath('thead/tr | tbody/tr | tr') 27 | headers = [] 28 | seen_data = False 29 | data = [] 30 | for row in rows: 31 | cell_elements = row.find_elements_by_xpath('th | td') 32 | cell_values = [x.text.strip() for x in cell_elements] 33 | is_header_values = [x in header_names for x in cell_values if x] 34 | if len(is_header_values) == 0: 35 | is_header = True 36 | else: 37 | if any(is_header_values) != all(is_header_values): 38 | raise RuntimeError('Header mismatch: %r' % (list( 39 | zip(is_header_values, 40 | [x for x in cell_values if x]), 41 | ))) 42 | is_header = any(is_header_values) 43 | if is_header and (not seen_data or not single_header): 44 | if seen_data: 45 | headers.clear() 46 | cur_header = dict() 47 | headers.append(cur_header) 48 | cur_col = 0 49 | for text, el in zip(cell_values, cell_elements): 50 | colspan = el.get_attribute('colspan') 51 | if colspan is None: 52 | colspan = 1 53 | else: 54 | colspan = int(colspan) 55 | for span in range(colspan): 56 | if text: 57 | cur_header[cur_col] = text 58 | cur_col += 1 59 | else: 60 | seen_data = True 61 | cur_col = 0 62 | cur_data = [] 63 | for text, el in zip(cell_values, cell_elements): 64 | colspan = el.get_attribute('colspan') 65 | if colspan is None: 66 | colspan = 1 67 | else: 68 | colspan = int(colspan) 69 | header_parts = [] 70 | for span in range(colspan): 71 | for header in headers: 72 | part = header.get(cur_col) 73 | if part is not None: 74 | header_parts.append(part) 75 | cur_col += 1 76 | if text: 77 | cur_data.append((':'.join(header_parts), text)) 78 | if cur_data: 79 | data.append(cur_data) 80 | return data 81 | 82 
| 83 | def find_table_by_headers(scraper, headers): 84 | tables = None 85 | for header in headers: 86 | new_tables = scraper.find_visible_elements_by_descendant_partial_text( 87 | header, 'table') 88 | if tables is None: 89 | tables = set(new_tables) 90 | else: 91 | tables &= set(new_tables) 92 | return tables 93 | 94 | 95 | # https://stackoverflow.com/questions/8344776/can-selenium-interact-with-an-existing-browser-session 96 | def attach_to_session(executor_url, session_id): 97 | original_execute = WebDriver.execute 98 | 99 | def new_command_execute(self, command, params=None): 100 | if command == "newSession": 101 | # Mock the response 102 | return {'success': 0, 'value': None, 'sessionId': session_id} 103 | else: 104 | return original_execute(self, command, params) 105 | 106 | # Patch the function before creating the driver object 107 | WebDriver.execute = new_command_execute 108 | driver = webdriver.Remote(command_executor=executor_url, 109 | desired_capabilities={}) 110 | driver.session_id = session_id 111 | # Replace the patched function with original function 112 | WebDriver.execute = original_execute 113 | return driver 114 | 115 | 116 | def is_displayed(element): 117 | """Returns `True` if `element` is displayed. 118 | 119 | Ignores StaleElementReferenceException. 120 | """ 121 | 122 | try: 123 | return element.is_displayed() 124 | except StaleElementReferenceException: 125 | return False 126 | 127 | 128 | class Scraper(object): 129 | def __init__(self, download_dir=None, connect=None, chromedriver_bin='finance-dl-chromedriver-wrapper', 130 | headless=True, use_seleniumrequests=False, session_id=None, profile_dir=None): 131 | 132 | self.download_dir = download_dir 133 | 134 | if connect is not None and session_id is not None: 135 | print('Connecting to existing browser: %s %s' % (connect, 136 | session_id)) 137 | self.driver = attach_to_session(connect, session_id) 138 | return 139 | 140 | original_sigint_handler = signal.getsignal(signal.SIGINT) 141 | signal.signal(signal.SIGINT, signal.SIG_IGN) 142 | 143 | self.chromedriver_bin = chromedriver_bin 144 | chrome_options = webdriver.ChromeOptions() 145 | service_args = [] 146 | chrome_options.add_experimental_option('excludeSwitches', [ 147 | 'enable-automation', 148 | 'load-extension', 149 | 'load-component-extension', 150 | 'ignore-certificate-errors', 151 | 'test-type', 152 | ]) 153 | if profile_dir is not None: 154 | chrome_options.add_argument('user-data-dir=%s' % profile_dir) 155 | if not os.path.exists(profile_dir): 156 | os.makedirs(profile_dir) 157 | prefs = {} 158 | prefs['plugins.plugins_disabled'] = [ 159 | 'Chrome PDF Viewer', 'Chromium PDF Viewer' 160 | ] 161 | prefs['plugins.always_open_pdf_externally'] = True 162 | if download_dir is not None: 163 | prefs['download.default_directory'] = download_dir 164 | chrome_options.add_experimental_option('prefs', prefs) 165 | if headless: 166 | chrome_options.add_argument('headless') 167 | if use_seleniumrequests: 168 | driver_class = seleniumrequests.Chrome 169 | else: 170 | driver_class = webdriver.Chrome 171 | self.driver = driver_class( 172 | executable_path=self.chromedriver_bin, 173 | chrome_options=chrome_options, 174 | service_args=service_args, 175 | ) 176 | print(' --connect=%s --session-id=%s' % 177 | (self.driver.command_executor._url, self.driver.session_id)) 178 | signal.signal(signal.SIGINT, original_sigint_handler) 179 | 180 | def check_after_wait(self): 181 | """Function called after each wait.""" 182 | pass 183 | 184 | def get_downloaded_file(self): 185 | 
names = os.listdir(self.download_dir) 186 | partial_names = [] 187 | other_names = [] 188 | for name in names: 189 | if name.endswith('.part') or name.endswith('.crdownload'): 190 | partial_names.append(name) 191 | else: 192 | other_names.append(name) 193 | if len(other_names) == 0: 194 | return None 195 | if len(other_names) > 1: 196 | raise RuntimeError( 197 | 'More than one downloaded file: %r' % other_names) 198 | # if len(partial_names) > 0: 199 | # raise RuntimeError('Partial download files remain: %r' % partial_names) 200 | path = os.path.join(self.download_dir, other_names[0]) 201 | with open(path, 'rb') as f: 202 | data = f.read() 203 | if len(data) == 0: 204 | return None 205 | os.remove(path) 206 | return other_names[0], data 207 | 208 | # See http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html 209 | @contextlib.contextmanager 210 | def wait_for_page_load(self, timeout=30): 211 | old_page = self.driver.find_element_by_tag_name('html') 212 | yield 213 | WebDriverWait(self.driver, timeout).until( 214 | expected_conditions.staleness_of(old_page), 215 | message='waiting for page to load') 216 | self.check_after_wait() 217 | 218 | @contextlib.contextmanager 219 | def wait_for_new_url(self, timeout=30): 220 | old_url = self.driver.current_url 221 | yield 222 | 223 | def is_new_url(): 224 | if self.driver.current_url != old_url: 225 | return True 226 | raise NoSuchElementException 227 | 228 | self.wait_and_return(is_new_url) 229 | 230 | def wait_and_return(self, *conditions, timeout=30, 231 | message='Waiting to match conditions'): 232 | results = [None] 233 | 234 | def predicate(driver): 235 | results[0] = tuple(condition() for condition in conditions) 236 | return all(results[0]) 237 | 238 | WebDriverWait(self.driver, timeout).until(predicate, message=message) 239 | self.check_after_wait() 240 | return results[0] 241 | 242 | def wait_and_locate(self, *locators, timeout=30, only_displayed=False): 243 | conditions = [] 244 | for locator in locators: 245 | 246 | def condition(locator=locator): 247 | element = self.driver.find_element(*locator) 248 | if only_displayed: 249 | if not is_displayed(element): 250 | raise NoSuchElementException 251 | return element 252 | 253 | conditions.append(condition) 254 | return self.wait_and_return( 255 | *conditions, timeout=timeout, 256 | message='Waiting to locate %r' % (locators, )) 257 | 258 | def for_each_frame(self): 259 | self.driver.switch_to.default_content() 260 | 261 | seen_ids = set() 262 | def helper(nesting_level=0): 263 | def handle_frames(frames): 264 | frames = [f for f in frames if f.id not in seen_ids] 265 | seen_ids.update(f.id for f in frames) 266 | for frame in frames: 267 | self.driver.switch_to.frame(frame) 268 | yield from helper(nesting_level=nesting_level + 1) 269 | self.driver.switch_to.parent_frame() 270 | yield 271 | for element_name in ['frame', 'iframe']: 272 | try: 273 | other_frames = self.find_visible_elements( 274 | By.TAG_NAME, element_name) 275 | yield from handle_frames(other_frames) 276 | except: 277 | pass 278 | 279 | yield from helper() 280 | 281 | def find_elements_in_any_frame(self, by_method, locator, predicate=None, 282 | only_displayed=False): 283 | for frame in self.for_each_frame(): 284 | try: 285 | for element in self.driver.find_elements(by_method, locator): 286 | if only_displayed: 287 | try: 288 | if not is_displayed(element): 289 | continue 290 | except: 291 | import traceback 292 | traceback.print_exc() 293 | continue 294 | if predicate is None or 
predicate(element): 295 | yield element 296 | except NoSuchElementException: 297 | pass 298 | 299 | def find_element_in_any_frame(self, by_method, locator, **kwargs): 300 | for element in self.find_elements_in_any_frame(by_method, locator, 301 | **kwargs): 302 | return element 303 | raise NoSuchElementException 304 | 305 | def interact(self, global_vars, local_vars): 306 | import IPython 307 | # ipshell = InteractiveShellEmbed(banner1='', exit_msg='') 308 | # ipshell.extension_manager.load_extension('autoreload') 309 | # ipshell.run_line_magic('autoreload', '2') 310 | # ipshell.autoindent = False 311 | ns = global_vars.copy() 312 | ns.update(local_vars) 313 | ns['self'] = self 314 | IPython.terminal.ipapp.launch_new_instance(argv=[], user_ns=ns) 315 | # ipshell(local_ns=ns) 316 | # vars = global_vars.copy() 317 | # vars.update(local_vars) 318 | # shell = code.InteractiveConsole(vars) 319 | # shell.interact() 320 | 321 | def find_username_and_password(self): 322 | passwords = self.driver.find_elements(By.XPATH, 323 | '//input[@type="password"]') 324 | passwords = [x for x in passwords if is_displayed(x)] 325 | if len(passwords) == 0: 326 | raise NoSuchElementException() 327 | password = passwords[0] 328 | username = password.find_elements( 329 | By.XPATH, 'preceding::input[@type="text" or @type="email"]')[-1] 330 | if not is_displayed(username): 331 | raise NoSuchElementException() 332 | return username, password 333 | 334 | def find_username_and_password_in_any_frame(self): 335 | for frame in self.for_each_frame(): 336 | try: 337 | return self.find_username_and_password() 338 | except NoSuchElementException: 339 | pass 340 | raise NoSuchElementException() 341 | 342 | def find_visible_elements_by_descendant_partial_text( 343 | self, text, element_name): 344 | return self.find_elements_by_descendant_partial_text( 345 | text, element_name, only_displayed=True) 346 | 347 | def find_elements_by_descendant_partial_text(self, text, element_name, 348 | only_displayed=False): 349 | all_elements = self.driver.find_elements_by_xpath( 350 | "//text()[contains(.,%r)]/ancestor::*[self::%s][1]" % 351 | (text, element_name)) 352 | if only_displayed: 353 | return [x for x in all_elements if is_displayed(x)] 354 | return all_elements 355 | 356 | def find_elements_by_descendant_text_match(self, text_match, element_name, 357 | only_displayed=False): 358 | all_elements = self.driver.find_elements_by_xpath( 359 | "//text()[%s]/ancestor::*[self::%s][1]" % (text_match, 360 | element_name)) 361 | if only_displayed: 362 | return [x for x in all_elements if is_displayed(x)] 363 | return all_elements 364 | 365 | def find_visible_elements_by_partial_text(self, text, element_name): 366 | all_elements = self.driver.find_elements_by_xpath( 367 | "//%s[contains(.,%r)]" % (element_name, text)) 368 | return [x for x in all_elements if is_displayed(x)] 369 | 370 | def find_visible_elements(self, by_method, locator): 371 | elements = self.driver.find_elements(by_method, locator) 372 | return [x for x in elements if is_displayed(x)] 373 | 374 | def click(self, link): 375 | self.driver.execute_script('arguments[0].scrollIntoView(true);', link) 376 | link.click() 377 | 378 | 379 | @contextlib.contextmanager 380 | def temp_scraper(scraper_type, *args, headless=True, connect=None, 381 | session_id=None, **kwargs): 382 | download_dir = tempfile.mkdtemp() 383 | try: 384 | scraper = scraper_type(*args, download_dir=download_dir, 385 | connect=connect, session_id=session_id, 386 | headless=headless, **kwargs) 387 | try: 388 | yield 
scraper 389 | finally: 390 | if connect is None: 391 | try: 392 | scraper.driver.quit() 393 | except Exception as e: 394 | print('Error quitting driver: %r' % e) 395 | finally: 396 | shutil.rmtree(download_dir) 397 | 398 | 399 | def retry(func, num_tries=3, retry_delay=0): 400 | while True: 401 | try: 402 | return func() 403 | except Exception as e: 404 | import traceback 405 | traceback.print_exc() 406 | num_tries -= 1 407 | if num_tries <= 0: 408 | raise 409 | print('Waiting %g seconds before retrying' % (retry_delay, )) 410 | time.sleep(retry_delay) 411 | 412 | 413 | def run_with_scraper(scraper_class, **kwargs): 414 | first_call = True 415 | 416 | def fetch(): 417 | nonlocal first_call 418 | if not first_call: 419 | kwargs['headless'] = False 420 | first_call = False 421 | with temp_scraper(scraper_class, **kwargs) as scraper: 422 | scraper.run() 423 | 424 | retry(fetch) 425 | 426 | 427 | @contextlib.contextmanager 428 | def interact_with_scraper(scraper_class, **kwargs): 429 | with temp_scraper(scraper_class, **kwargs) as scraper: 430 | yield dict( 431 | scraper=scraper, 432 | self=scraper, 433 | By=By, 434 | Select=Select, 435 | Keys=Keys, 436 | ) 437 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /finance_dl/mint.py: -------------------------------------------------------------------------------- 1 | """Downloads Mint.com transactions and balance data. 2 | 3 | This uses the `mintapi` Python package in conjunction with the `selenium` Python 4 | package and `chromedriver` to scrape the Mint.com website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. It is highly recommended to specify a `profile_dir` to avoid 23 | having to manually enter a multi-factor authentication code each time. 24 | 25 | - `merge_files`: Optional. If specified, must be a list of `str` values that 26 | specify the paths to additional CSV files containing transactions in the same 27 | format as the `mint.csv` output file. These files are merged with the 28 | contents of `mint.csv` into a new file `mint-merged.csv` in the specified 29 | `output_directory`. 30 | 31 | - `skip_refresh`: Optional. Defaults to `False`. A value of `True` indicates 32 | not to wait until all account data has been refreshed. 33 | 34 | Output format: 35 | ============== 36 | 37 | The transactions are saved to a file named `mint.csv` under the specified output 38 | directory. Balance information is saved to files named 39 | `balances.%Y-%m-%dT%H%M%S%z.csv` under the specified output directory. 40 | 41 | Duplicate transactions are excluded from the merged file, in the following way: 42 | since the Mint CSV format lacks any sort of unique transaction identifier, 43 | multiple legitimate transactions may produce identical lines in the CSV file. 44 | Therefore, for each unique CSV line, considering only the 'Date', 'Original 45 | Description', 'Amount', 'Transaction Type', and 'Account Name' fields, the 46 | merged file contains N copies of this line, where N is the maximum number of 47 | times this line occurs in any of the input CSV files. 48 | 49 | Example: 50 | ======== 51 | 52 | def CONFIG_mint(): 53 | return dict( 54 | module='finance_dl.mint', 55 | credentials={ 56 | 'username': 'XXXXXX', 57 | 'password': 'XXXXXX', 58 | }, 59 | output_directory=os.path.join(data_dir, 'mint'), 60 | # profile_dir is optional, but highly recommended to avoid having to 61 | # enter multi-factor authentication code each time. 62 | profile_dir=os.path.join(profile_dir, 'mint'), 63 | ) 64 | 65 | Interactive shell: 66 | ================== 67 | 68 | From the interactive shell, type: 69 | 70 | run(output_directory=output_directory, profile_dir=profile_dir, 71 | credentials=credentials) 72 | 73 | to run the scraper. 
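
Merge de-duplication example:
=============================

A minimal sketch of the duplicate-handling rule described under "Output
format" above (an illustration of the counting behaviour only, not the actual
`merge_mint_data` implementation; `rows_per_file` is a hypothetical list of
row tuples, one list per input CSV file):

    import collections

    def merged_row_counts(rows_per_file):
        # For each unique row, keep the maximum count seen in any single file.
        merged = collections.Counter()
        for rows in rows_per_file:
            for row, count in collections.Counter(rows).items():
                merged[row] = max(merged[row], count)
        return merged

So a row that appears twice in `mint.csv` and three times in one of the
`merge_files` appears three times (not five) in `mint-merged.csv`.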
74 | 75 | """ 76 | 77 | import os 78 | from typing import Sequence, Optional, Dict 79 | import dateutil.parser 80 | import io 81 | import csv 82 | import re 83 | import contextlib 84 | import collections 85 | import urllib.parse 86 | import datetime 87 | import time 88 | import json 89 | import logging 90 | import traceback 91 | from selenium.webdriver.common.by import By 92 | from selenium.webdriver.common.keys import Keys 93 | import selenium.common.exceptions 94 | 95 | from . import csv_merge 96 | from . import scrape_lib 97 | 98 | if False: 99 | from mintapi import Mint # for typing only 100 | 101 | logger = logging.getLogger('mint') 102 | 103 | netloc_re = r'^([^\.@]+\.)*(mint|intuit).com$' 104 | 105 | 106 | def check_url(url): 107 | result = urllib.parse.urlparse(url) 108 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 109 | raise RuntimeError('Reached invalid URL: %r' % url) 110 | 111 | 112 | class MintTokenScraper(scrape_lib.Scraper): 113 | def __init__(self, credentials, login_timeout=30, **kwargs): 114 | super().__init__(use_seleniumrequests=True, **kwargs) 115 | self.credentials = credentials 116 | self.login_timeout = login_timeout 117 | 118 | def login(self): 119 | logger.info('Logging into mint') 120 | self.driver.get( 121 | "https://accounts.intuit.com/index.html?offering_id=Intuit.ifs.mint&namespace_id=50000026&redirect_url=https://mint.intuit.com/overview.event" 122 | ) 123 | logger.info('Waiting to enter username and password') 124 | (username, password), = self.wait_and_return( 125 | self.find_username_and_password_in_any_frame) 126 | logger.info('Entering username and password') 127 | username.send_keys(self.credentials['username']) 128 | password.send_keys(self.credentials['password']) 129 | password.send_keys(Keys.ENTER) 130 | start_time = time.time() 131 | while not self.driver.current_url.startswith( 132 | 'https://mint.intuit.com/overview.event'): 133 | logger.info('Waiting for MFA') 134 | time.sleep(1) 135 | cur_time = time.time() 136 | if self.login_timeout is not None and cur_time > start_time + self.login_timeout: 137 | raise TimeoutError('Login failed to complete within timeout') 138 | 139 | while True: 140 | token_element, = self.wait_and_locate((By.NAME, 'javascript-user')) 141 | value_json = token_element.get_attribute('value') 142 | logger.info('scraped user data: %r', value_json) 143 | try: 144 | value = json.loads(value_json) 145 | if isinstance(value, dict) and 'token' in value: 146 | break 147 | except ValueError: 148 | pass 149 | logger.info('Waiting for token') 150 | time.sleep(1) 151 | cur_time = time.time() 152 | if self.login_timeout is not None and cur_time > start_time + self.login_timeout: 153 | raise TimeoutError('Login failed to complete within timeout') 154 | 155 | 156 | @contextlib.contextmanager 157 | def connect(credentials, scraper_args=None): 158 | import mintapi 159 | mint = mintapi.Mint() 160 | scraper_args = dict(scraper_args or {}) 161 | 162 | def try_login(scraper): 163 | scraper = MintTokenScraper(credentials=credentials, **scraper_args) 164 | scraper.login() 165 | mint.driver = scraper.driver 166 | mint.token = mint.get_token() 167 | 168 | with scrape_lib.temp_scraper(MintTokenScraper, credentials=credentials, 169 | **scraper_args) as scraper: 170 | okay = False 171 | try: 172 | try_login(scraper) 173 | okay = True 174 | except (TimeoutError, selenium.common.exceptions.TimeoutException): 175 | if not scraper_args.get('headless') and not scraper_args.get( 176 | 'login_timeout'): 177 | raise 178 | 
traceback.print_exc() 179 | if okay: 180 | yield mint 181 | return 182 | scraper_args['headless'] = True 183 | scraper_args['login_timeout'] = None 184 | logger.info('Retrying login interactively') 185 | with scrape_lib.temp_scraper(MintTokenScraper, credentials=credentials, 186 | **scraper_args) as scraper: 187 | try_login(scraper) 188 | yield mint 189 | 190 | 191 | def match_csv_to_json(csv_entry: dict, json_entry: dict): 192 | json_date = dateutil.parser.parse(json_entry['date']) 193 | json_csv_entry = { 194 | 'Date': 195 | '%d/%02d/%d' % (json_date.month, json_date.day, json_date.year), 196 | 'Original Description': 197 | json_entry['omerchant'], 198 | 'Amount': 199 | json_entry['amount'].translate({ 200 | ord('$'): None, 201 | ord(','): None 202 | }), 203 | 'Transaction Type': 204 | 'debit' if json_entry['isDebit'] else 'credit', 205 | 'Account Name': 206 | json_entry['account'], 207 | } 208 | csv_entry = csv_entry.copy() 209 | csv_entry.pop('Category', None) 210 | csv_entry.pop('Description', None) 211 | csv_entry.pop('Labels', None) 212 | csv_entry.pop('Notes', None) 213 | if csv_entry != json_csv_entry: 214 | raise RuntimeError('CSV entry %r does not match JSON entry %r' % 215 | (csv_entry, json_csv_entry)) 216 | 217 | 218 | def get_annotated_transactions(mint: 'Mint', num_attempts: int = 3): 219 | for attempt_num in range(num_attempts): 220 | try: 221 | logger.info('Getting CSV transactions') 222 | csv_data = mint.get_transactions_csv( 223 | include_investment=True).decode() 224 | if len(csv_data) == 0: 225 | raise RuntimeError('Received empty Mint data') 226 | 227 | logger.info('Getting JSON transactions') 228 | json_data = mint.get_transactions_json(include_investment=True) 229 | 230 | reader = csv.DictReader(io.StringIO(csv_data, newline='')) 231 | csv_rows = list(reader) 232 | 233 | if len(csv_rows) != len(json_data): 234 | raise RuntimeError('CSV data does not match JSON data') 235 | 236 | for csv_entry, json_entry in zip(csv_rows, json_data): 237 | match_csv_to_json(csv_entry, json_entry) 238 | break 239 | except: 240 | if attempt_num + 1 == num_attempts: 241 | raise 242 | return (reader.fieldnames, list(zip(csv_rows, json_data))) 243 | 244 | 245 | def refresh_mint_data(mint: 'Mint'): 246 | logger.info('Initiating account refresh') 247 | mint.initiate_account_refresh() 248 | # Wait for downloading to be complete 249 | logger.info('Waiting for accounts to update') 250 | polling_interval_seconds = 5 251 | start_time = time.time() 252 | while True: 253 | time.sleep(polling_interval_seconds) 254 | accounts = mint.get_accounts() 255 | cur_time = time.time() 256 | pending = [] 257 | ok = [] 258 | other = [] 259 | for account in accounts: 260 | status = account['fiLoginStatus'] 261 | if status in ['DOWNLOADING_IN_PROGRESS', 'REFRESH_REQUESTED']: 262 | pending.append(account) 263 | elif status == 'OK': 264 | ok.append(account) 265 | else: 266 | other.append(account) 267 | if len(pending) == 0: 268 | break 269 | logger.info('[%d seconds] Still downloading: %s', 270 | cur_time - start_time, ' '.join( 271 | '%r' % account['name'] for account in pending)) 272 | cur_time = time.time() 273 | logger.info('[%d seconds] Finished updating' % (cur_time - start_time)) 274 | for account in other: 275 | logger.info('Account %r in state %r', account['name'], 276 | account['fiLoginStatus']) 277 | 278 | 279 | mint_date_format = '%m/%d/%Y' 280 | 281 | 282 | def get_mint_date(row: dict): 283 | date = datetime.datetime.strptime(row['Date'], mint_date_format).date() 284 | return date 285 | 286 | 287 | 
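# download_mint_data drops transactions that Mint flags as pending or
# duplicate, records each account's most recent transaction date, and builds
# one balance row per account (negating credit-card balances, which Mint
# reports with the opposite sign). It returns the filtered CSV text together
# with the balance rows.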
def download_mint_data(mint: 'Mint'): 288 | fieldnames, entries = get_annotated_transactions(mint) 289 | non_pending_txns = [ 290 | csv_row for csv_row, json_row in entries 291 | if not json_row['isPending'] and not json_row['isDuplicate'] 292 | ] 293 | 294 | balances = [] 295 | account_max_transaction_date = dict() # type: Dict[str, datetime.date] 296 | for csv_row in non_pending_txns: 297 | date = get_mint_date(csv_row) 298 | account = csv_row['Account Name'] 299 | prev_date = account_max_transaction_date.get(account) 300 | if prev_date is None or prev_date < date: 301 | account_max_transaction_date[account] = date 302 | 303 | account_data = mint.get_accounts() 304 | for account in account_data: 305 | account_name = account['name'] 306 | max_date = account_max_transaction_date.get(account_name) 307 | max_date_str = (max_date.strftime(mint_date_format) 308 | if max_date is not None else '') 309 | balance = account.get('currentBalance', '') 310 | if account['accountType'] == 'credit': 311 | # Mint negates credit card balances. 312 | balance = -balance 313 | balances.append({ 314 | 'Name': account_name, 315 | 'Currency': account.get('currency', ''), 316 | 'Balance': str(balance), 317 | 'Last Updated': str(account.get('lastUpdated', '')), 318 | 'State': account.get('fiLoginStatus', ''), 319 | 'Last Transaction': max_date_str, 320 | }) 321 | 322 | new_csv = io.StringIO(newline='') 323 | new_csv_data = csv.DictWriter(new_csv, fieldnames=fieldnames, 324 | lineterminator='\n', quoting=csv.QUOTE_ALL) 325 | new_csv_data.writeheader() 326 | new_csv_data.writerows(non_pending_txns) 327 | 328 | csv_data = new_csv.getvalue() 329 | return csv_data, balances 330 | 331 | 332 | def merge_mint_data(mint_data_list: Sequence[str]): 333 | fieldnames = None 334 | merged_counter = collections.Counter() # type: Dict[tuple, int] 335 | merged_rows = [] 336 | keep_fields = [ 337 | 'Date', 'Original Description', 'Amount', 'Transaction Type', 338 | 'Account Name' 339 | ] 340 | 341 | def convert_row(row) -> tuple: 342 | return tuple(row[field] for field in keep_fields) 343 | 344 | for csv_data in mint_data_list: 345 | cur_counter = collections.Counter() # type: Dict[tuple, int] 346 | reader = csv.DictReader(io.StringIO(csv_data, newline='')) 347 | if fieldnames is None: 348 | fieldnames = reader.fieldnames 349 | else: 350 | assert fieldnames == reader.fieldnames, (fieldnames, 351 | reader.fieldnames) 352 | rows = list(reader) 353 | for row in rows: 354 | converted_row = convert_row(row) 355 | cur_counter[converted_row] += 1 356 | if cur_counter[converted_row] > merged_counter[converted_row]: 357 | merged_rows.append(row) 358 | merged_counter[converted_row] += 1 359 | 360 | merged_rows.sort(key=get_mint_date, reverse=True) 361 | 362 | assert fieldnames is not None 363 | 364 | new_csv = io.StringIO(newline='') 365 | new_csv_data = csv.DictWriter(new_csv, fieldnames=fieldnames, 366 | lineterminator='\n', quoting=csv.QUOTE_ALL) 367 | new_csv_data.writeheader() 368 | new_csv_data.writerows(merged_rows) 369 | 370 | csv_data = new_csv.getvalue() 371 | return csv_data 372 | 373 | 374 | def merge_mint_files(input_paths: Sequence[str], output_path: str): 375 | mint_data_list = [] 376 | for filename in input_paths: 377 | with open(filename, 'r', encoding='utf-8', newline='') as f: 378 | mint_data_list.append(f.read()) 379 | csv_data = merge_mint_data(mint_data_list) 380 | with open(output_path, 'w', encoding='utf-8', newline='') as f: 381 | f.write(csv_data) 382 | 383 | 384 | def verify_mint_update_consistency(csv_data: str, 
                                   existing_filename: str,
385 |                                    allow_missing: bool = False):
386 |     unchanged = False
387 | 
388 |     if os.path.exists(existing_filename):
389 |         missing = False
390 |         with open(existing_filename, 'r', encoding='utf-8', newline='') as f:
391 |             old_data = f.read()
392 | 
393 |         def get_rows(data):
394 |             reader = csv.DictReader(io.StringIO(data, newline=''))
395 |             csv_rows = list(reader)
396 |             keep_fields = [
397 |                 'Date', 'Original Description', 'Amount', 'Transaction Type',
398 |                 'Account Name'
399 |             ]
400 | 
401 |             def convert_row(row):
402 |                 return tuple(row[field] for field in keep_fields)
403 | 
404 |             return list(map(convert_row, csv_rows))
405 | 
406 |         if old_data == csv_data:
407 |             unchanged = True
408 |         else:
409 |             old_rows = get_rows(old_data)
410 |             old_counter = collections.Counter(old_rows)
411 |             new_rows = get_rows(csv_data)
412 |             new_counter = collections.Counter(new_rows)
413 | 
414 |             for k in old_rows:
415 |                 if old_counter[k] > new_counter[k]:
416 |                     logger.warning('New file missing entry: %s', k)
417 |                     missing = True
418 |         if missing and not allow_missing:
419 |             raise RuntimeError('New file is missing some existing entries')
420 |     if not unchanged:
421 |         with open(existing_filename, 'w', encoding='utf-8', newline='') as f:
422 |             f.write(csv_data)
423 | 
424 | 
425 | def fetch_mint_data(credentials: dict, existing_filename: str,
426 |                     new_filename: str, balances_output_prefix: str,
427 |                     skip_refresh: bool = False, skip_download: bool = False,
428 |                     allow_missing: bool = False, **kwargs):
429 |     if new_filename == existing_filename:
430 |         raise ValueError('new_filename must not equal existing_filename')
431 |     if skip_download:
432 |         with open(new_filename, 'r', encoding='utf-8', newline='') as f:
433 |             csv_data = f.read()
434 |     else:
435 |         with connect(credentials, kwargs) as mint:
436 |             if not skip_refresh:
437 |                 refresh_mint_data(mint)
438 |             csv_data, balances = download_mint_data(mint)
439 |         with open(new_filename, 'w', encoding='utf-8', newline='') as f:
440 |             f.write(csv_data)
441 | 
442 |     if not skip_download:
443 |         # Balance data is only available when a fresh download was performed.
444 |         balances_path = balances_output_prefix + time.strftime(
445 |             '%Y-%m-%dT%H%M%S%z') + '.csv'
446 |         logger.info('Writing balances to: %s', balances_path)
447 |         csv_merge.write_csv(['Name', 'Currency', 'Balance', 'Last Updated',
448 |                              'State', 'Last Transaction'], balances, balances_path)
449 | 
450 |     verify_mint_update_consistency(csv_data=csv_data,
451 |                                    existing_filename=existing_filename,
452 |                                    allow_missing=allow_missing)
453 | 
454 | 
455 | def run(output_directory: str, merge_files: Sequence[str] = (), **kwargs):
456 |     if not os.path.exists(output_directory):
457 |         os.makedirs(output_directory)
458 |     existing_filename = os.path.join(output_directory, 'mint.csv')
459 |     new_filename = os.path.join(output_directory, 'mint.csv.new')
460 |     balances_output_prefix = os.path.join(output_directory, 'balances.')
461 |     fetch_mint_data(existing_filename=existing_filename,
462 |                     new_filename=new_filename,
463 |                     balances_output_prefix=balances_output_prefix, **kwargs)
464 |     if merge_files:
465 |         merged_filename = os.path.join(output_directory, 'mint-merged.csv')
466 |         merge_mint_files([existing_filename] + list(merge_files),
467 |                          merged_filename)
468 |         logger.info('Saved merged transactions to: %s', merged_filename)
469 | 
470 | 
471 | @contextlib.contextmanager
472 | def interactive(**kwargs):
473 |     with connect(kwargs['credentials'],
474 |                  dict(profile_dir=kwargs.get('profile_dir'))) as mint:
475 |         kwargs['mint'] = mint
476 |         yield kwargs
477 | 
--------------------------------------------------------------------------------