├── finance_dl ├── __init__.py ├── chromedriver_wrapper.py ├── ofx_rename.py ├── csv_merge.py ├── google_login.py ├── cli.py ├── google_takeout.py ├── ebmud.py ├── google_purchases.py ├── anthem.py ├── pge.py ├── comcast.py ├── update.py ├── stockplanconnect.py ├── waveapps.py ├── ultipro_google.py ├── paypal.py ├── amazon.py ├── venmo.py ├── healthequity.py ├── ofx.py ├── scrape_lib.py └── mint.py ├── .gitignore ├── mypy.ini ├── .travis.yml ├── tox.ini ├── .bumpversion.cfg ├── tests └── test_ofx.py ├── setup.py ├── .style.yapf ├── README.md ├── example_finance_dl_config.py └── LICENSE /finance_dl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | .mypy_cache 4 | .tox 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unused_configs = True 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "3.5" 5 | - "3.6" 6 | install: pip install tox-travis 7 | script: tox 8 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py37 4 | 5 | [testenv] 6 | deps = 7 | mypy 8 | 9 | commands = 10 | pytest . 11 | mypy finance_dl 12 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.3.2 3 | tag = True 4 | commit = True 5 | message = chore: update package version to {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | search = version='{current_version}' 9 | replace = version='{new_version}' 10 | 11 | -------------------------------------------------------------------------------- /finance_dl/chromedriver_wrapper.py: -------------------------------------------------------------------------------- 1 | """Runs chromedriver in a new process group. 2 | 3 | This prevents it from being killed when typing Control+c in an interactive 4 | shell. 
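The wrapper detaches into its own process group (where supported) and then execs the real chromedriver binary; the binary to run can be overridden via the `ACTUAL_CHROMEDRIVER_PATH` environment variable and otherwise defaults to `chromedriver` on `PATH`.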
5 | """ 6 | 7 | import os 8 | import sys 9 | import chromedriver_binary 10 | 11 | 12 | def main(): 13 | 14 | try: 15 | os.setpgrp() 16 | except: 17 | # os.setpgrp not available on Windows 18 | pass 19 | 20 | executable_path = os.getenv('ACTUAL_CHROMEDRIVER_PATH', 'chromedriver') 21 | os.execvp(executable_path, [executable_path] + sys.argv[1:]) 22 | -------------------------------------------------------------------------------- /tests/test_ofx.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from finance_dl.ofx import sanitize_account_name 4 | 5 | 6 | def test_sanitize_account_name_disallows_dot(): 7 | with pytest.raises(ValueError): 8 | sanitize_account_name('.') 9 | 10 | 11 | def test_sanitize_account_name_disallows_double_dot(): 12 | with pytest.raises(ValueError): 13 | sanitize_account_name('..') 14 | 15 | 16 | def test_sanitize_account_name_passes_through_standard_characters(): 17 | account_name = 'abc1234.5678-90-XYZ' 18 | 19 | assert sanitize_account_name(account_name) == account_name 20 | 21 | 22 | def test_sanitize_account_name_replaces_invalid_characters(): 23 | assert sanitize_account_name('1234$!5678:XYZ') == '1234-5678-XYZ' 24 | -------------------------------------------------------------------------------- /finance_dl/ofx_rename.py: -------------------------------------------------------------------------------- 1 | """Renames improperly-named OFX files generated by the finance_dl.ofx module""" 2 | import argparse 3 | import os 4 | 5 | import bs4 6 | 7 | from .ofx import get_ofx_date_range 8 | 9 | 10 | def fix_name(path, dry_run): 11 | name = os.path.basename(path) 12 | d = os.path.dirname(path) 13 | date_format = '%Y%m%d' 14 | 15 | parts = name.split('-') 16 | assert len(parts) == 4 17 | 18 | with open(path, 'rb') as f: 19 | date_range = get_ofx_date_range(f.read()) 20 | new_parts = [ 21 | date_range[0].strftime(date_format), date_range[1].strftime(date_format) 22 | ] + parts[2:] 23 | new_name = '-'.join(new_parts) 24 | if new_name != name: 25 | new_path = os.path.join(d, new_name) 26 | print('Rename %s -> %s' % (path, new_path)) 27 | if not dry_run: 28 | os.rename(path, new_path) 29 | 30 | 31 | if __name__ == '__main__': 32 | ap = argparse.ArgumentParser() 33 | ap.add_argument('paths', nargs='*') 34 | args = ap.parse_args() 35 | ap.add_argument('--dry-run', action='store_true') 36 | for path in args.paths: 37 | fix_name(path, dry_run=args.dry_run) 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | with open(os.path.join(os.path.dirname(__file__), 'README.md'), 'r') as f: 5 | long_description = f.read() 6 | 7 | setup( 8 | name='finance-dl', 9 | description='Tools for scraping personal financial data.', 10 | long_description=long_description, 11 | long_description_content_type='text/markdown', 12 | version='1.3.2', 13 | url='https://github.com/jbms/finance-dl', 14 | author='Jeremy Maitin-Shepard', 15 | author_email="jeremy@jeremyms.com", 16 | license='GPLv2', 17 | packages=["finance_dl"], 18 | entry_points={ 19 | 'console_scripts': [ 20 | 'finance-dl = finance_dl.cli:main', 21 | 'finance-dl-chromedriver-wrapper = finance_dl.chromedriver_wrapper:main', 22 | ], 23 | }, 24 | python_requires='>=3.5', 25 | install_requires=[ 26 | 'bs4', 27 | 'mintapi>=1.31', 28 | 'ofxclient', 29 | 'selenium', 30 | 'ipython', 31 | 'selenium-requests', 32 | 
'chromedriver-binary', 33 | 'beancount>=2.1.2', 34 | 'atomicwrites>=1.3.0', 35 | 'jsonschema', 36 | ], 37 | tests_require=[ 38 | 'pytest', 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /finance_dl/csv_merge.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import collections 3 | import os 4 | 5 | from atomicwrites import atomic_write 6 | 7 | 8 | def merge_overlapping_csv_rows(csv_data_list, compare_fields): 9 | """Merge overlapping CSV files. 10 | 11 | Rows are compared based on the list 'compare_fields' of field names. 12 | The number of duplicate copies of a row kept in the result is equal 13 | to the maximum number of duplicates in any single file. 14 | 15 | :param csv_data_list: list of rows, each row being represented by a 16 | dict 17 | :param compare_fields: list of field names by which duplicates are 18 | detected. 19 | 20 | :return: Returns the merged list of rows. 21 | """ 22 | 23 | def convert_row(row): 24 | return tuple(row[field] for field in compare_fields) 25 | 26 | merged_counter = collections.Counter() 27 | merged_rows = [] 28 | for csv_data in csv_data_list: 29 | cur_counter = collections.Counter() 30 | for row in csv_data: 31 | converted_row = convert_row(row) 32 | cur_counter[converted_row] += 1 33 | if cur_counter[converted_row] > merged_counter[converted_row]: 34 | merged_rows.append(row) 35 | merged_counter[converted_row] += 1 36 | return merged_rows 37 | 38 | 39 | def write_csv(field_names, data, filename): 40 | with atomic_write(filename, mode='w', newline='', encoding='utf-8') as f: 41 | csv_writer = csv.DictWriter( 42 | f, field_names, lineterminator='\n', quoting=csv.QUOTE_ALL) 43 | csv_writer.writeheader() 44 | csv_writer.writerows(data) 45 | 46 | 47 | def merge_into_file(filename, 48 | field_names, 49 | data, 50 | sort_by=None, 51 | compare_fields=None): 52 | if compare_fields is None: 53 | compare_fields = field_names 54 | 55 | if os.path.exists(filename): 56 | with open(filename, 'r', newline='', encoding='utf-8') as f: 57 | reader = csv.DictReader(f) 58 | assert reader.fieldnames == field_names, (reader.fieldnames, field_names) 59 | existing_rows = list(reader) 60 | data = merge_overlapping_csv_rows([existing_rows, data], 61 | compare_fields=compare_fields) 62 | if sort_by is not None: 63 | data.sort(key=sort_by) 64 | write_csv(field_names=field_names, data=data, filename=filename) 65 | -------------------------------------------------------------------------------- /finance_dl/google_login.py: -------------------------------------------------------------------------------- 1 | """Handles Google account login.""" 2 | 3 | import logging 4 | from typing import Dict, cast, Any 5 | 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.common.keys import Keys 8 | 9 | from . 
import scrape_lib 10 | 11 | logger = logging.getLogger('google_login') 12 | 13 | 14 | def login(scraper: scrape_lib.Scraper, login_url: str): 15 | logger.info('Initiating log in') 16 | with scraper.wait_for_page_load(): 17 | scraper.driver.get(login_url) 18 | 19 | cur_url = scraper.driver.current_url 20 | if not cur_url.startswith('https://accounts.google.com/'): 21 | logger.info('Assuming already logged in due to url of %s', cur_url) 22 | return 23 | 24 | logger.info('Waiting for username or password field') 25 | 26 | def find_username_or_other_account_button(): 27 | username = scraper.find_visible_elements(By.XPATH, 28 | '//input[@type="email"]') 29 | password = scraper.find_visible_elements(By.XPATH, 30 | '//input[@type="password"]') 31 | other_account = scraper.find_visible_elements( 32 | By.XPATH, '//div[text()="Use another account"]') 33 | if len(username) == 1: 34 | return username[0], None, None 35 | if len(password) == 1: 36 | return None, password[0], None 37 | if len(other_account) == 1: 38 | return None, None, other_account[0] 39 | return None 40 | 41 | (username, password, other_account_button 42 | ), = scraper.wait_and_return(find_username_or_other_account_button) 43 | if other_account_button: 44 | scraper.click(other_account_button) 45 | (username, ), = scraper.wait_and_return( 46 | lambda: scraper.find_visible_elements(By.XPATH, '//input[@type="email"]') 47 | ) 48 | credentials = cast(Any, scraper).credentials # type: Dict[str, str] 49 | if not password: 50 | logger.info('Entering username') 51 | username.send_keys(credentials['username']) 52 | username.send_keys(Keys.ENTER) 53 | logger.info('Waiting for password field') 54 | (password, ), = scraper.wait_and_return( 55 | lambda: scraper.find_visible_elements(By.XPATH, '//input[@type="password"]') 56 | ) 57 | logger.info('Entering password') 58 | password.send_keys(credentials['password']) 59 | password.send_keys(Keys.ENTER) 60 | -------------------------------------------------------------------------------- /finance_dl/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import logging 4 | import json 5 | 6 | 7 | def get_log_level(name): 8 | name = name.upper() 9 | numeric_level = getattr(logging, name, None) 10 | if not isinstance(numeric_level, int): 11 | raise ValueError('Invalid log level: %s' % name) 12 | return name 13 | 14 | 15 | def main(): 16 | ap = argparse.ArgumentParser() 17 | ap.add_argument('--config-module', type=str, 18 | help='Python module defining CONFIG_ functions.') 19 | spec_group = ap.add_mutually_exclusive_group(required=True) 20 | spec_group.add_argument('--config', '-c', type=str, 21 | help='Configuration name to use.') 22 | spec_group.add_argument('--spec', '-s', type=json.loads, 23 | help='JSON configuration specification') 24 | ap.add_argument('--interactive', '-i', action='store_true', default=False, 25 | help='Start interactive shell.') 26 | ap.add_argument( 27 | '--visible', action='store_true', help= 28 | 'Run with a visible browser (if applicable). Implied by --interactive.' 
29 | ) 30 | ap.add_argument('--log', type=get_log_level, default=logging.INFO, 31 | help='Log level.') 32 | args = ap.parse_args() 33 | logging.basicConfig( 34 | level=args.log, 35 | format='%(asctime)s %(filename)s:%(lineno)d [%(levelname)s] %(message)s') 36 | 37 | if args.config_module: 38 | config_module = importlib.import_module(args.config_module) 39 | else: 40 | config_module = object() 41 | 42 | if args.config: 43 | key_prefix = 'CONFIG_' 44 | config_key = key_prefix + args.config 45 | if config_key is None: 46 | valid_keys = sorted( 47 | k for k in vars(config_module) if k.startswith(key_prefix)) 48 | raise KeyError( 49 | 'Invalid configuration key: %r. Valid configuration keys: %r.' 50 | % (config_key, valid_keys)) 51 | spec = getattr(config_module, config_key, None)() 52 | else: 53 | spec = args.spec 54 | module_name = spec.pop('module') 55 | module = importlib.import_module(module_name) 56 | 57 | headless = not args.visible 58 | if args.interactive: 59 | headless = False 60 | spec.setdefault('headless', headless) 61 | 62 | if args.interactive: 63 | 64 | def run_interactive_shell(**ns): 65 | import IPython 66 | user_ns = dict(vars(module), **ns) 67 | 68 | # Don't leave __name__ set, as that causes IPython to override the 69 | # real module's entry in sys.modules. 70 | user_ns.pop('__name__', None) 71 | IPython.terminal.ipapp.launch_new_instance( 72 | argv=[ 73 | '--no-banner', 74 | '--no-autoindent', 75 | '--InteractiveShellApp.exec_lines=["%load_ext autoreload", "%autoreload 2"]', 76 | ], 77 | user_ns=user_ns, 78 | ) 79 | 80 | interactive_func = getattr(module, 'interactive', None) 81 | if interactive_func is not None: 82 | with interactive_func(**spec) as ns: 83 | run_interactive_shell(**ns) 84 | else: 85 | run_interactive_shell() 86 | else: 87 | module.run(**spec) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /finance_dl/google_takeout.py: -------------------------------------------------------------------------------- 1 | """Retrieves Google data using https://takeout.google.com 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Google Takeout website. 5 | 6 | This is not itself a finance_dl data source, but is used by the 7 | `finance_dl.google_purchases` module. 8 | """ 9 | 10 | from typing import List, Any, Iterable, FrozenSet 11 | import urllib.parse 12 | import re 13 | import io 14 | import logging 15 | import time 16 | import zipfile 17 | from selenium.webdriver.common.by import By 18 | from . import scrape_lib 19 | from . 
import google_login 20 | 21 | logger = logging.getLogger('google_takeout') 22 | 23 | netloc_re = r'^([^\.@]+\.)*google.com$' 24 | 25 | 26 | def check_url(url): 27 | result = urllib.parse.urlparse(url) 28 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 29 | raise RuntimeError('Reached invalid URL: %r' % url) 30 | 31 | 32 | class Scraper(scrape_lib.Scraper): 33 | def __init__(self, credentials: dict, **kwargs): 34 | super().__init__(**kwargs) 35 | self.credentials = credentials 36 | 37 | def check_after_wait(self): 38 | check_url(self.driver.current_url) 39 | 40 | def _get_categories(self): 41 | categories, = self.wait_and_return(lambda: self.driver.find_elements( 42 | By.XPATH, '//input[@type="checkbox"]')) 43 | return categories 44 | 45 | def _create_archive(self, categories: FrozenSet[str]): 46 | logger.info('Selecting categories') 47 | checkboxes = self._get_categories() 48 | found_ids = set() 49 | for checkbox in checkboxes: 50 | value = checkbox.get_attribute('value') 51 | found_ids.add(value) 52 | wanted = value in categories 53 | checked = checkbox.get_attribute('checked') == 'true' 54 | if wanted != checked: 55 | checkbox.click() 56 | remaining = categories - found_ids 57 | if remaining: 58 | raise RuntimeError( 59 | 'Categories not found: %s' % ', '.join(sorted(remaining))) 60 | logger.info('Creating archive') 61 | checkboxes[0].submit() 62 | 63 | def _get_download_links(self): 64 | download_links = self.driver.find_elements(By.XPATH, 65 | '//a[.="Download"]') 66 | return [x.get_attribute('href') for x in download_links] 67 | 68 | def get_takeout_zipfile(self, categories: Iterable[str]) -> zipfile.ZipFile: 69 | """Returns a zipfile containing the specified takeout categories.""" 70 | google_login.login(self, 71 | 'https://takeout.google.com/settings/takeout/light') 72 | # Wait for at least one checkbox 73 | self._get_categories() 74 | # Wait 2 seconds to be sure all have loaded and then get new checkboxes 75 | time.sleep(2) 76 | # Get existing download links 77 | download_links = self._get_download_links() 78 | self._create_archive(categories=frozenset(categories)) 79 | 80 | for attempt_i in range(3): 81 | logger.info('Waiting for new download links (attempt %d)', 82 | attempt_i + 1) 83 | # Wait 10 seconds for the archive to be created 84 | time.sleep(10) 85 | with self.wait_for_page_load(): 86 | self.driver.refresh() 87 | new_download_links = set( 88 | self._get_download_links()) - set(download_links) 89 | if len(new_download_links) == 0: continue 90 | if len(new_download_links) > 1: 91 | raise RuntimeError('More than one new archive found') 92 | break 93 | new_download_link = list(new_download_links)[0] 94 | logger.info('Downloading archive') 95 | google_login.login(self, new_download_link) 96 | (_, data), = self.wait_and_return(self.get_downloaded_file) 97 | return zipfile.ZipFile(io.BytesIO(data)) 98 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | # Align closing bracket with visual indentation. 3 | ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT=True 4 | 5 | # Allow lambdas to be formatted on more than one line. 6 | ALLOW_MULTILINE_LAMBDAS=True 7 | 8 | # Insert a blank line before a 'def' or 'class' immediately nested 9 | # within another 'def' or 'class'. For example: 10 | # 11 | # class Foo: 12 | # # <------ this blank line 13 | # def method(): 14 | # ... 
15 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF=False 16 | 17 | # The column limit. 18 | COLUMN_LIMIT=80 19 | 20 | # Indent width used for line continuations. 21 | CONTINUATION_INDENT_WIDTH=4 22 | 23 | # Put closing brackets on a separate line, dedented, if the bracketed 24 | # expression can't fit in a single line. Applies to all kinds of brackets, 25 | # including function definitions and calls. For example: 26 | # 27 | # config = { 28 | # 'key1': 'value1', 29 | # 'key2': 'value2', 30 | # } # <--- this bracket is dedented and on a separate line 31 | # 32 | # time_series = self.remote_client.query_entity_counters( 33 | # entity='dev3246.region1', 34 | # key='dns.query_latency_tcp', 35 | # transform=Transformation.AVERAGE(window=timedelta(seconds=60)), 36 | # start_ts=now()-timedelta(days=3), 37 | # end_ts=now(), 38 | # ) # <--- this bracket is dedented and on a separate line 39 | DEDENT_CLOSING_BRACKETS=False 40 | 41 | # The regex for an i18n comment. The presence of this comment stops 42 | # reformatting of that line, because the comments are required to be 43 | # next to the string they translate. 44 | I18N_COMMENT= 45 | 46 | # The i18n function call names. The presence of this function stops 47 | # reformattting on that line, because the string it has cannot be moved 48 | # away from the i18n comment. 49 | I18N_FUNCTION_CALL= 50 | 51 | # Indent the dictionary value if it cannot fit on the same line as the 52 | # dictionary key. For example: 53 | # 54 | # config = { 55 | # 'key1': 56 | # 'value1', 57 | # 'key2': value1 + 58 | # value2, 59 | # } 60 | INDENT_DICTIONARY_VALUE=False 61 | 62 | # The number of columns to use for indentation. 63 | INDENT_WIDTH=4 64 | 65 | # Join short lines into one line. E.g., single line 'if' statements. 66 | JOIN_MULTIPLE_LINES=True 67 | 68 | # Use spaces around the power operator. 69 | SPACES_AROUND_POWER_OPERATOR=False 70 | 71 | # The number of spaces required before a trailing comment. 72 | SPACES_BEFORE_COMMENT=2 73 | 74 | # Insert a space between the ending comma and closing bracket of a list, 75 | # etc. 76 | SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET=True 77 | 78 | # Split before arguments if the argument list is terminated by a 79 | # comma. 80 | SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED=True 81 | 82 | # Set to True to prefer splitting before '&', '|' or '^' rather than 83 | # after. 84 | SPLIT_BEFORE_BITWISE_OPERATOR=False 85 | 86 | # If an argument / parameter list is going to be split, then split before 87 | # the first argument. 88 | SPLIT_BEFORE_FIRST_ARGUMENT=False 89 | 90 | # Set to True to prefer splitting before 'and' or 'or' rather than 91 | # after. 92 | SPLIT_BEFORE_LOGICAL_OPERATOR=False 93 | 94 | # Split named assignments onto individual lines. 95 | SPLIT_BEFORE_NAMED_ASSIGNS=True 96 | 97 | # The penalty for splitting right after the opening bracket. 98 | SPLIT_PENALTY_AFTER_OPENING_BRACKET=30 99 | 100 | # The penalty for splitting the line after a unary operator. 101 | SPLIT_PENALTY_AFTER_UNARY_OPERATOR=10000 102 | 103 | # The penalty for splitting right before an if expression. 104 | SPLIT_PENALTY_BEFORE_IF_EXPR=0 105 | 106 | # The penalty of splitting the line around the '&', '|', and '^' 107 | # operators. 108 | SPLIT_PENALTY_BITWISE_OPERATOR=300 109 | 110 | # The penalty for characters over the column limit. 111 | SPLIT_PENALTY_EXCESS_CHARACTER=2600 112 | 113 | # The penalty incurred by adding a line split to the unwrapped line. The 114 | # more line splits added the higher the penalty. 
115 | SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT=30 116 | 117 | # The penalty of splitting a list of "import as" names. For example: 118 | # 119 | # from a_very_long_or_indented_module_name_yada_yad import (long_argument_1, 120 | # long_argument_2, 121 | # long_argument_3) 122 | # 123 | # would reformat to something like: 124 | # 125 | # from a_very_long_or_indented_module_name_yada_yad import ( 126 | # long_argument_1, long_argument_2, long_argument_3) 127 | SPLIT_PENALTY_IMPORT_NAMES=0 128 | 129 | # The penalty of splitting the line around the 'and' and 'or' 130 | # operators. 131 | SPLIT_PENALTY_LOGICAL_OPERATOR=300 132 | 133 | # Use the Tab character for indentation. 134 | USE_TABS=False 135 | -------------------------------------------------------------------------------- /finance_dl/ebmud.py: -------------------------------------------------------------------------------- 1 | """Retrieves East Bay Municipal Utility District (EBMUD) PDF water bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Bill Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_ebmud(): 39 | return dict( 40 | module='finance_dl.ebmud', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'ebmud'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import logging 58 | import os 59 | 60 | import urllib.parse 61 | import dateutil.parser 62 | from selenium.webdriver.common.by import By 63 | from selenium.webdriver.support.ui import Select 64 | from selenium.webdriver.common.keys import Keys 65 | 66 | from . 
import scrape_lib 67 | 68 | logger = logging.getLogger('ebmud_scrape') 69 | 70 | netloc_re = r'^([^\.@]+\.)*ebmud.com$' 71 | 72 | 73 | def check_url(url): 74 | result = urllib.parse.urlparse(url) 75 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 76 | raise RuntimeError('Reached invalid URL: %r' % url) 77 | 78 | 79 | class Scraper(scrape_lib.Scraper): 80 | def __init__(self, credentials, output_directory, **kwargs): 81 | super().__init__(**kwargs) 82 | self.credentials = credentials 83 | self.output_directory = output_directory 84 | self.logged_in = False 85 | 86 | def check_after_wait(self): 87 | check_url(self.driver.current_url) 88 | 89 | def login(self): 90 | if self.logged_in: 91 | return 92 | logger.info('Initiating log in') 93 | self.driver.get( 94 | 'https://www.ebmud.com/customers/account/manage-your-account') 95 | 96 | (username, password), = self.wait_and_return( 97 | self.find_username_and_password_in_any_frame) 98 | logger.info('Entering username and password') 99 | username.send_keys(self.credentials['username']) 100 | password.send_keys(self.credentials['password']) 101 | with self.wait_for_page_load(): 102 | password.send_keys(Keys.ENTER) 103 | logger.info('Logged in') 104 | self.logged_in = True 105 | 106 | def get_statements(self): 107 | logger.info('Looking for statement link') 108 | statements_link, = self.wait_and_locate((By.LINK_TEXT, 109 | 'View Statements')) 110 | statements_link.click() 111 | 112 | (statements_table, ), = self.wait_and_return( 113 | lambda: self.find_visible_elements_by_descendant_partial_text('Statement Date', 'table') 114 | ) 115 | rows = statements_table.find_elements_by_xpath('tbody/tr/td') 116 | for row in rows: 117 | row_text_parts = row.text.split() 118 | assert len(row_text_parts) == 4 119 | statement_date = dateutil.parser.parse(row_text_parts[0]).date() 120 | output_date_format = '%Y-%m-%d' 121 | statement_path = os.path.join( 122 | self.output_directory, '%s.bill.pdf' % 123 | (statement_date.strftime(output_date_format), )) 124 | if os.path.exists(statement_path): 125 | logger.info('Skipping existing statement: %s', statement_path) 126 | continue 127 | logger.info('Downloading %s', statement_path) 128 | self.click(row) 129 | download_result, = self.wait_and_return(self.get_downloaded_file) 130 | tmp_path = statement_path + '.tmp' 131 | with open(tmp_path, 'wb') as f: 132 | f.write(download_result[1]) 133 | os.rename(tmp_path, statement_path) 134 | logger.info('Wrote %s', statement_path) 135 | 136 | def run(self): 137 | self.login() 138 | self.get_statements() 139 | 140 | 141 | def run(**kwargs): 142 | scrape_lib.run_with_scraper(Scraper, **kwargs) 143 | 144 | 145 | def interactive(**kwargs): 146 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Python package for scraping personal financial data from financial 2 | institutions. 3 | 4 | [![License: GPL v2](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](LICENSE) 5 | [![Build Status](https://travis-ci.com/jbms/finance-dl.svg?branch=master)](https://travis-ci.com/jbms/finance-dl) 6 | 7 | This package may be useful on its own, but is specifically designed to be 8 | used with 9 | [beancount-import](https://github.com/jbms/beancount-import). 
10 | 11 | Supported data sources 12 | == 13 | 14 | - [finance_dl.ofx](finance_dl/ofx.py): uses 15 | [ofxclient](https://github.com/captin411/ofxclient) to download data 16 | using the OFX protocol. 17 | - [finance_dl.mint](finance_dl/mint.py): uses 18 | [mintapi](https://github.com/mrooney/mintapi) to download data from 19 | the Mint.com website. 20 | - [finance_dl.venmo](finance_dl/venmo.py): downloads transaction and 21 | balance information from the Venmo.com website 22 | - [finance_dl.paypal](finance_dl/paypal.py): downloads transactions 23 | from the Paypal.com website 24 | - [finance_dl.amazon](finance_dl/amazon.py): downloads order invoices 25 | from the Amazon website 26 | - [finance_dl.healthequity](finance_dl/healthequity.py): downloads 27 | transaction history and balance information from the HealthEquity 28 | website. 29 | - [finance_dl.google_purchases](finance_dl/google_purchases.py): 30 | downloads purchases that Google has heuristically extracted from 31 | Gmail messages. 32 | - [finance_dl.stockplanconnect](finance_dl/stockplanconnect.py): 33 | downloads PDF documents (including release and trade confirmations) 34 | from the Morgan Stanley Stockplanconnect website. 35 | - [finance_dl.pge](finance_dl/pge.py): downloads Pacific Gas & 36 | Electric (PG&E) PDF bills. 37 | - [finance_dl.comcast](finance_dl/comcast.py): downloads Comcast PDF 38 | bills. 39 | - [finance_dl.ebmud](finance_dl/ebmud.py): downloads East Bay 40 | Municipal Utility District (EBMUD) water bills. 41 | - [finance_dl.anthem](finance_dl/anthem.py): downloads Anthem 42 | BlueCross insurance claim statements. 43 | - [finance_dl.waveapps](finance_dl/waveapps.py): downloads receipt 44 | images and extracted transaction data from 45 | [Wave](https://waveapps.com), which is a free receipt-scanning 46 | website/mobile app. 47 | - [finance_dl.ultipro_google](finance_dl/ultipro_google.py): downloads 48 | Google employee payroll statements in PDF format from Ultipro. 49 | 50 | Setup 51 | == 52 | 53 | To install the most recent published package from PyPi, simply type: 54 | 55 | ```shell 56 | pip install finance-dl 57 | ``` 58 | 59 | To install from a clone of the repository, type: 60 | 61 | ```shell 62 | pip install . 63 | ``` 64 | 65 | or for development: 66 | 67 | ```shell 68 | pip install -e . 69 | ``` 70 | 71 | Configuration 72 | == 73 | 74 | Create a Python file like `example_finance_dl_config.py`. 75 | 76 | Refer to the documentation of the individual scraper modules for 77 | details. 78 | 79 | Basic Usage 80 | == 81 | 82 | You can run a scraping configuration named `myconfig` as follows: 83 | 84 | python -m finance_dl.cli --config-module example_finance_dl_config --config myconfig 85 | 86 | The configuration `myconfig` refers to a function named 87 | `CONFIG_myconfig` in the configuration module. 88 | 89 | Make sure that your configuration module is accessible in your Python 90 | `sys.path`. Since `sys.path` includes the current directory by 91 | default, you can simply run this command from the directory that 92 | contains your configuration module. 93 | 94 | By default, the scrapers run fully automatically, and the ones based 95 | on `selenium` and `chromedriver` run in headless mode. If the initial 96 | attempt for a `selenium`-based scraper fails, it is automatically 97 | retried again with the browser window visible. This allows you to 98 | manually complete the login process and enter any multi-factor 99 | authentication code that is required. 
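A configuration can also be passed directly on the command line as JSON using the `--spec` (`-s`) option instead of `--config`. For example (placeholder values shown; the accepted keys depend on the scraper module):

```shell
python -m finance_dl.cli --spec '{"module": "finance_dl.comcast", "credentials": {"username": "XXXXXX", "password": "XXXXXX"}, "output_directory": "/path/to/data/comcast"}'
```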
100 | 101 | To debug a scraper, you can run it in interactive mode by specifying 102 | the `-i` command-line argument. This runs an interactive IPython 103 | shell that lets you manually invoke parts of the scraping process. 104 | 105 | Automatic Usage 106 | == 107 | 108 | To run multiple configurations at once, and keep track of when each 109 | configuration was last updated, you can use the `finance_dl.update` 110 | tool. 111 | 112 | To display the update status, first create a `logs` directory and run: 113 | 114 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs status 115 | 116 | Initially, this will indicate that none of the configurations have 117 | been updated. To update a single configuration `myconfig`, run: 118 | 119 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update myconfig 120 | 121 | With a single configuration specified, this does the same thing as the 122 | `finance_dl.cli` tool, except that the log messages are written to 123 | `logs/myconfig.txt` and a `logs/myconfig.lastupdate` file is created 124 | if it is successful. 125 | 126 | If multiple configurations are specified, as in: 127 | 128 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update myconfig1 myconfig2 129 | 130 | then all specified configurations are run in parallel. 131 | 132 | To update all configurations, run: 133 | 134 | python -m finance_dl.update --config-module example_finance_dl_config --log-dir logs update --all 135 | 136 | License 137 | == 138 | 139 | Copyright (C) 2014-2018 Jeremy Maitin-Shepard. 140 | 141 | Distributed under the GNU General Public License, Version 2.0 only. 142 | See [LICENSE](LICENSE) file for details. 143 | -------------------------------------------------------------------------------- /finance_dl/google_purchases.py: -------------------------------------------------------------------------------- 1 | """Retrieves purchase and reservation history from Google. 2 | 3 | This contains purchases that have been heuristically extracted from Gmail 4 | messages, and possibly other sources. 5 | 6 | This uses the `selenium` Python package in conjunction with `chromedriver` to 7 | scrape the Google Takeout and Google purchases/reservations websites. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 15 | keys. 16 | 17 | - `output_directory`: Required. Must be a `str` that specifies the path on the 18 | local filesystem where the output will be written. If the directory does not 19 | exist, it will be created. 20 | 21 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 22 | path to a persistent Chrome browser profile to use. This should be a path 23 | used solely for this single configuration; it should not refer to your normal 24 | browser profile. If not specified, a fresh temporary profile will be used 25 | each time. 26 | 27 | Output format: 28 | ============== 29 | 30 | For each purchase, two files are written to the specified `output_directory`: 31 | `.html` contains the raw HTML content of the order details page, and 32 | `order_.json` is a JSON file in the Google Takeout Purchases/Reservations 33 | format. 
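For example, a purchase with order ID `123456789` is written as `123456789.html` and `order_123456789.json`.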
34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_google_purchases(): 39 | return dict( 40 | module='finance_dl.google_purchases', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'google_purchases'), 46 | # profile_dir is optional. 47 | profile_dir=os.path.join(profile_dir, 'google_purchases'), 48 | ) 49 | 50 | Interactive shell: 51 | ================== 52 | 53 | From the interactive shell, type: `self.run()` to start the scraper. 54 | 55 | """ 56 | 57 | from typing import List, Any, Tuple 58 | import urllib.parse 59 | import re 60 | import json 61 | import logging 62 | import os 63 | from selenium.webdriver.common.by import By 64 | from selenium.webdriver.support.ui import Select 65 | from selenium.webdriver.common.keys import Keys 66 | from selenium.common.exceptions import NoSuchElementException 67 | import jsonschema 68 | from atomicwrites import atomic_write 69 | from . import scrape_lib 70 | from . import google_login 71 | from . import google_takeout 72 | 73 | logger = logging.getLogger('google_purchases') 74 | 75 | netloc_re = r'^([^\.@]+\.)*google.com$' 76 | 77 | 78 | def check_url(url): 79 | result = urllib.parse.urlparse(url) 80 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 81 | raise RuntimeError('Reached invalid URL: %r' % url) 82 | 83 | class Scraper(google_takeout.Scraper): 84 | def __init__(self, output_directory: str, **kwargs): 85 | super().__init__(**kwargs) 86 | self.output_directory = output_directory 87 | 88 | def check_after_wait(self): 89 | check_url(self.driver.current_url) 90 | 91 | def extract_raw_data(self): 92 | source = self.driver.page_source 93 | prefix = 'data:function(){return ' 94 | start_index = source.index(prefix) + len(prefix) 95 | source_suffix = source[start_index:] 96 | try: 97 | value = json.loads(source_suffix) 98 | raise ValueError('Expected error parsing JSON') 99 | except json.JSONDecodeError as e: 100 | encoded_json = source_suffix[:e.pos] 101 | value = json.loads(encoded_json) 102 | return value 103 | 104 | def _fetch_html_pages(self, need_to_fetch: List[Tuple[str, str]]): 105 | logger.info('Fetching details for %d purchases', len(need_to_fetch)) 106 | for i, (purchase_id, html_path) in enumerate(need_to_fetch): 107 | url = 'https://myaccount.google.com/purchases/detail?order_id=' + purchase_id 108 | logger.info('Fetching details %d/%d: %s', i, len(need_to_fetch), url) 109 | with self.wait_for_page_load(): 110 | self.driver.get(url) 111 | content = self.driver.page_source 112 | with atomic_write( 113 | html_path, mode='w', encoding='utf-8', newline='\n') as f: 114 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 115 | f.write('\ufeff' + content) 116 | logger.info('Write details %d/%d: %s', i, len(need_to_fetch), html_path) 117 | 118 | def run(self): 119 | if not os.path.exists(self.output_directory): 120 | os.makedirs(self.output_directory) 121 | 122 | self.download_data() 123 | 124 | def download_data(self): 125 | takeout_zip = self.get_takeout_zipfile(['my_orders']) 126 | need_to_fetch = [] 127 | for name in takeout_zip.namelist(): 128 | m = re.match(r'.*/order_([0-9]+)\.json$', name) 129 | if m is None: 130 | logger.info('Ignoring file in takeout archive: %s', name) 131 | continue 132 | order_id = m.group(1) 133 | json_path = os.path.join(self.output_directory, 134 | 'order_' + order_id + '.json') 135 | if not os.path.exists(json_path): 136 | with atomic_write(json_path, mode='wb') 
as f: 137 | f.write(takeout_zip.read(name)) 138 | html_path = os.path.join(self.output_directory, order_id + '.html') 139 | if os.path.exists(html_path): 140 | continue 141 | need_to_fetch.append((order_id, html_path)) 142 | self._fetch_html_pages(need_to_fetch) 143 | 144 | 145 | def run(**kwargs): 146 | scrape_lib.run_with_scraper(Scraper, **kwargs) 147 | 148 | 149 | def interactive(**kwargs): 150 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 151 | -------------------------------------------------------------------------------- /example_finance_dl_config.py: -------------------------------------------------------------------------------- 1 | """Example configuration file for finance_dl. 2 | 3 | Configuration entries are defined by defining a top-level function with a name 4 | beginning with `CONFIG_`. The portion after the `CONFIG_` prefix is the name 5 | of the configuration. 6 | 7 | Rather than hard code your usernames and passwords into this configuration 8 | file, you may instead wish to write some code to retrieve them from some 9 | external password store. 10 | """ 11 | 12 | import os 13 | 14 | # Directory for persistent browser profiles. 15 | profile_dir = os.path.join(os.getenv('HOME'), '.cache', 'finance_dl') 16 | data_dir = '/path/where/data/will/be/saved' 17 | 18 | 19 | def CONFIG_vanguard(): 20 | # To determine the correct values for `id`, `org`, and `url` for your 21 | # financial institution, search on https://www.ofxhome.com/ 22 | ofx_params = { 23 | 'id': '15103', 24 | 'org': 'Vanguard', 25 | 'url': 'https://vesnc.vanguard.com/us/OfxDirectConnectServlet', 26 | 'username': 'XXXXXX', 27 | 'password': 'XXXXXX', 28 | } 29 | return dict( 30 | module='finance_dl.ofx', 31 | ofx_params=ofx_params, 32 | output_directory=os.path.join(data_dir, 'vanguard'), 33 | ) 34 | 35 | 36 | def CONFIG_amazon(): 37 | return dict( 38 | module='finance_dl.amazon', 39 | credentials={ 40 | 'username': 'XXXXXX', 41 | 'password': 'XXXXXX', 42 | }, 43 | output_directory=os.path.join(data_dir, 'amazon'), 44 | # profile_dir is optional. 45 | profile_dir=os.path.join(profile_dir, 'amazon'), 46 | ) 47 | 48 | 49 | def CONFIG_mint(): 50 | return dict( 51 | module='finance_dl.mint', 52 | credentials={ 53 | 'username': 'XXXXXX', 54 | 'password': 'XXXXXX', 55 | }, 56 | output_directory=os.path.join(data_dir, 'mint'), 57 | # profile_dir is optional, but highly recommended to avoid having to 58 | # enter multi-factor authentication code each time. 59 | profile_dir=os.path.join(profile_dir, 'mint'), 60 | ) 61 | 62 | 63 | def CONFIG_healthequity(): 64 | return dict( 65 | module='finance_dl.healthequity', 66 | credentials={ 67 | 'username': 'XXXXXX', 68 | 'password': 'XXXXXX', 69 | }, 70 | # Use your HealthEquity account number as the last directory component. 71 | output_directory=os.path.join(data_dir, 'healthequity', '1234567'), 72 | 73 | # profile_dir is optional but highly recommended to avoid having to 74 | # enter multi-factor authentication code each time. 75 | profile_dir=os.path.join(profile_dir, 'healthequity'), 76 | ) 77 | 78 | 79 | def CONFIG_venmo(): 80 | return dict( 81 | module='finance_dl.venmo', 82 | credentials={ 83 | 'username': 'XXXXXX', 84 | 'password': 'XXXXXX', 85 | }, 86 | output_directory=os.path.join(data_dir, 'venmo'), 87 | 88 | # profile_dir is optional but highly recommended to avoid having to 89 | # enter multi-factor authentication code each time. 
90 | profile_dir=os.path.join(profile_dir, 'venmo'), 91 | ) 92 | 93 | 94 | def CONFIG_paypal(): 95 | return dict( 96 | module='finance_dl.paypal', 97 | credentials={ 98 | 'username': 'XXXXXX', 99 | 'password': 'XXXXXX', 100 | }, 101 | output_directory=os.path.join(data_dir, 'paypal'), 102 | ) 103 | 104 | 105 | def CONFIG_google_purchases(): 106 | return dict( 107 | module='finance_dl.google_purchases', 108 | credentials={ 109 | 'username': 'XXXXXX', 110 | 'password': 'XXXXXX', 111 | }, 112 | output_directory=os.path.join(data_dir, 'google_purchases'), 113 | ) 114 | 115 | 116 | def CONFIG_stockplanconnect(): 117 | return dict( 118 | module='finance_dl.stockplanconnect', 119 | credentials={ 120 | 'username': 'XXXXXX', 121 | 'password': 'XXXXXX', 122 | }, 123 | output_directory=os.path.join(data_dir, 'stockplanconnect'), 124 | headless=False, 125 | ) 126 | 127 | 128 | def CONFIG_pge(): 129 | return dict( 130 | module='finance_dl.pge', 131 | credentials={ 132 | 'username': 'XXXXXX', 133 | 'password': 'XXXXXX', 134 | }, 135 | output_directory=os.path.join(data_dir, 'pge'), 136 | ) 137 | 138 | 139 | def CONFIG_comcast(): 140 | return dict( 141 | module='finance_dl.comcast', 142 | credentials={ 143 | 'username': 'XXXXXX', 144 | 'password': 'XXXXXX', 145 | }, 146 | output_directory=os.path.join(data_dir, 'comcast'), 147 | ) 148 | 149 | 150 | def CONFIG_ebmud(): 151 | return dict( 152 | module='finance_dl.ebmud', 153 | credentials={ 154 | 'username': 'XXXXXX', 155 | 'password': 'XXXXXX', 156 | }, 157 | output_directory=os.path.join(data_dir, 'ebmud'), 158 | ) 159 | 160 | 161 | def CONFIG_anthem(): 162 | return dict( 163 | module='finance_dl.anthem', 164 | login_url='https://anthem.com', 165 | output_directory=os.path.join(data_dir, 'anthem'), 166 | profile_dir=os.path.join(profile_dir, 'anthem'), 167 | headless=False, 168 | ) 169 | 170 | 171 | def CONFIG_waveapps(): 172 | return dict( 173 | module='finance_dl.waveapps', 174 | credentials=dict( 175 | token='XXXXXXXX', 176 | ), 177 | output_directory=os.path.join(data_dir, 'waveapps'), 178 | ) 179 | 180 | 181 | def CONFIG_google_payroll(): 182 | return dict( 183 | module='finance_dl.ultipro_google', 184 | credentials={ 185 | 'username': 'XXXXXX', 186 | 'password': 'XXXXXX', 187 | }, 188 | output_directory=os.path.join(data_dir, 'documents', 'Income', 189 | 'Google'), 190 | 191 | # profile_dir is optional but recommended. 192 | profile_dir=os.path.join(profile_dir, 'google_payroll'), 193 | 194 | # Recommended for greater reliability. 195 | headless=False, 196 | ) 197 | -------------------------------------------------------------------------------- /finance_dl/anthem.py: -------------------------------------------------------------------------------- 1 | """Retrieves Anthem BlueCross Explanation of Benefits (EOB) statements. 2 | 3 | Due to automation countermeasures implemented by Anthem, this module is only 4 | semi-automatic: the user must manually login and navigate to the claims page. 5 | 6 | This uses the `selenium` Python package in conjunction with `chromedriver` to 7 | scrape the Anthem website. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `login_url`: Required. Must be a `str` that specifies the initial URL at 15 | which to start. The user is responsible for manually logging in and 16 | navigating to the claims page. 17 | 18 | - `output_directory`: Required. 
Must be a `str` that specifies the path on the 19 | local filesystem where the output will be written. If the directory does not 20 | exist, it will be created. 21 | 22 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 23 | path to a persistent Chrome browser profile to use. This should be a path 24 | used solely for this single configuration; it should not refer to your normal 25 | browser profile. If not specified, a fresh temporary profile will be used 26 | each time. 27 | 28 | - `headless`: Must be set to `False`, since this scraper requires manual input. 29 | 30 | Example: 31 | ======== 32 | 33 | def CONFIG_anthem(): 34 | return dict( 35 | module='finance_dl.anthem', 36 | login_url='https://anthem.com', 37 | output_directory=os.path.join(data_dir, 'anthem'), 38 | 39 | # profile_dir is optional but recommended. 40 | profile_dir=os.path.join(profile_dir, 'anthem'), 41 | 42 | # headless must be `False` since manual intervention is required 43 | headless=False, 44 | ) 45 | 46 | Output format: 47 | ============== 48 | 49 | For each claim, two files are written to the specified `output_directory`: 50 | `.json` contains a JSON representation of the claim as returned by the 51 | Anthem server, and `.pdf` contains the PDF "Explanation of Benefits" 52 | statement for the claim. 53 | 54 | The JSON file contains output of the form: 55 | 56 | { 57 | "patient": { 58 | "displayName": "John Smith", 59 | "uniqueId": "123456789", 60 | "allowsAccess": true 61 | }, 62 | "provider": "SOME MEDICAL PROVIDER", 63 | "totalCharges": 385, 64 | "serviceDate": "01/02/2017 00:00:00", 65 | "memberResponsibility": 111.05, 66 | "status": "Approved", 67 | "appliedToDeductible": 111.05, 68 | "claimNumber": "2017123AB1234" 69 | } 70 | 71 | 72 | 73 | Interactive shell: 74 | ================== 75 | 76 | From the interactive shell, type: `self.run()` to start the scraper. 77 | 78 | """ 79 | 80 | from typing import List, Any 81 | import urllib.parse 82 | import re 83 | import collections 84 | import json 85 | import logging 86 | import datetime 87 | import os 88 | from selenium.webdriver.common.by import By 89 | from selenium.webdriver.support.ui import Select 90 | from selenium.webdriver.common.keys import Keys 91 | from selenium.common.exceptions import NoSuchElementException 92 | import bs4 93 | import jsonschema 94 | from atomicwrites import atomic_write 95 | 96 | from . import scrape_lib 97 | from . 
import google_login 98 | 99 | logger = logging.getLogger('anthem') 100 | 101 | netloc_re = r'^([^\.@]+\.)*anthem.com$' 102 | 103 | 104 | class Scraper(scrape_lib.Scraper): 105 | def __init__(self, login_url: str, output_directory: str, **kwargs): 106 | super().__init__(use_seleniumrequests=True, **kwargs) 107 | self.login_url = login_url 108 | self.output_directory = output_directory 109 | 110 | def login(self): 111 | self.driver.get(self.login_url) 112 | 113 | def maybe_get_claims_json(self): 114 | try: 115 | soup = bs4.BeautifulSoup(self.driver.page_source, 'html.parser') 116 | return json.loads( 117 | soup.find(id='claimsJson').text, 118 | object_pairs_hook=collections.OrderedDict) 119 | except: 120 | raise NoSuchElementException 121 | 122 | def wait_for_claims_json(self): 123 | logger.info('Please login and navigate to the claims page') 124 | result = self.wait_and_return(self.maybe_get_claims_json, 125 | timeout=500)[0] 126 | logger.info('Claims data found') 127 | return result 128 | 129 | def save_documents(self): 130 | if not os.path.exists(self.output_directory): 131 | os.makedirs(self.output_directory) 132 | claims_json = self.wait_for_claims_json() 133 | downloads_needed = [] 134 | for claim in claims_json['claims']: 135 | url = claim['eobLinkUrl'] 136 | pdf_path = os.path.join(self.output_directory, 137 | claim['claimNumber'] + '.pdf') 138 | json_path = os.path.join(self.output_directory, 139 | claim['claimNumber'] + '.json') 140 | if not os.path.exists(json_path): 141 | with atomic_write( 142 | json_path, mode='w', encoding='utf-8', 143 | newline='\n') as f: 144 | f.write(json.dumps(claim, indent=' ').strip() + '\n') 145 | if not os.path.exists(pdf_path): 146 | if not claim['eobLinkUrl'].startswith('https:/'): continue 147 | downloads_needed.append((claim['eobLinkUrl'], pdf_path)) 148 | for i, (url, pdf_path) in enumerate(downloads_needed): 149 | logger.info('Downloading EOB %d/%d', i + 1, len(downloads_needed)) 150 | self.driver.get(url) 151 | download_result, = self.wait_and_return(self.get_downloaded_file) 152 | with atomic_write(pdf_path, mode='wb') as f: 153 | f.write(download_result[1]) 154 | 155 | def run(self): 156 | self.login() 157 | self.save_documents() 158 | 159 | 160 | def run(**kwargs): 161 | scrape_lib.run_with_scraper(Scraper, **kwargs) 162 | 163 | 164 | def interactive(**kwargs): 165 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 166 | -------------------------------------------------------------------------------- /finance_dl/pge.py: -------------------------------------------------------------------------------- 1 | """Retrieves Pacific Gas and Electric (PG&E) PDF bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. 
This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Statement Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_pge(): 39 | return dict( 40 | module='finance_dl.pge', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'pge'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import datetime 58 | import logging 59 | import os 60 | import urllib.parse 61 | 62 | from selenium.webdriver.common.by import By 63 | from selenium.webdriver.support.ui import Select 64 | from selenium.webdriver.common.keys import Keys 65 | 66 | from . import scrape_lib 67 | 68 | logger = logging.getLogger('pge_scrape') 69 | 70 | netloc_re = r'^([^\.@]+\.)*pge.com$' 71 | 72 | 73 | def check_url(url): 74 | result = urllib.parse.urlparse(url) 75 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 76 | raise RuntimeError('Reached invalid URL: %r' % url) 77 | 78 | 79 | def find_first_matching_date(lines, date_format): 80 | for line in lines: 81 | try: 82 | return datetime.datetime.strptime(line, date_format).date() 83 | except: 84 | pass 85 | return None 86 | 87 | 88 | class Scraper(scrape_lib.Scraper): 89 | def __init__(self, credentials, output_directory, **kwargs): 90 | super().__init__(**kwargs) 91 | self.credentials = credentials 92 | self.output_directory = output_directory 93 | self.logged_in = False 94 | 95 | def check_after_wait(self): 96 | check_url(self.driver.current_url) 97 | 98 | def login(self): 99 | if self.logged_in: 100 | return 101 | logger.info('Initiating log in') 102 | self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page') 103 | 104 | (username, password), = self.wait_and_return( 105 | self.find_username_and_password_in_any_frame) 106 | logger.info('Entering username and password') 107 | username.send_keys(self.credentials['username']) 108 | password.send_keys(self.credentials['password']) 109 | with self.wait_for_page_load(): 110 | password.send_keys(Keys.ENTER) 111 | logger.info('Logged in') 112 | self.logged_in = True 113 | 114 | def get_output_path(self, output_dir, date): 115 | journal_date_format = '%Y-%m-%d' 116 | return os.path.join( 117 | output_dir, '%s.bill.pdf' % (date.strftime(journal_date_format))) 118 | 119 | def process_download(self, download_result, output_dir): 120 | logger.info('Got download: %s' % download_result[0]) 121 | m = re.fullmatch(r'.*custbill([0-9]{2})([0-9]{2})([0-9]{4})\.pdf', 122 | download_result[0]) 123 | if not m: 124 | logger.error('Failed to determine date from downloaded file: %s' % 125 | download_result[0]) 126 | return True 127 | else: 128 | date = datetime.date( 129 | year=int(m.group(3)), month=int(m.group(1)), day=int( 130 | m.group(2))) 131 | new_path = self.get_output_path(output_dir, date) 132 | if os.path.exists(new_path): 133 | logger.info('Skipping duplicate download: %s', date) 134 | return False 135 | tmp_path = new_path + '.tmp' 136 | with open(tmp_path, 'wb') as f: 137 | download_data = download_result[1] 138 | f.write(download_data) 139 | 
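            # Write to a temporary file and rename it into place so that a partially-written bill is never left at the final path.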
os.rename(tmp_path, new_path) 140 | logger.info("Wrote %s", new_path) 141 | return True 142 | 143 | def get_bills(self, output_dir): 144 | logger.info('Looking for download link') 145 | (bills_link, ), = self.wait_and_return( 146 | lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2')) 147 | scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2) 148 | links, = self.wait_and_return( 149 | lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF") 150 | ) 151 | 152 | def do_download(link): 153 | scrape_lib.retry(lambda: self.click(link), retry_delay=2) 154 | logger.info('Waiting for download') 155 | download_result, = self.wait_and_return(self.get_downloaded_file) 156 | return self.process_download(download_result, output_dir) 157 | 158 | for link in links: 159 | if not do_download(link): 160 | break 161 | 162 | def run(self): 163 | self.login() 164 | if not os.path.exists(self.output_directory): 165 | os.makedirs(self.output_directory) 166 | self.get_bills(self.output_directory) 167 | 168 | 169 | def run(**kwargs): 170 | scrape_lib.run_with_scraper(Scraper, **kwargs) 171 | 172 | 173 | def interactive(**kwargs): 174 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 175 | -------------------------------------------------------------------------------- /finance_dl/comcast.py: -------------------------------------------------------------------------------- 1 | """Retrieves Comcast PDF bills. 2 | 3 | These PDF bills can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the bills will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | Output format: 27 | ============== 28 | 29 | Each statement is saved to the `output_directory` with a name like: 30 | 31 | 2017-11-28.bill.pdf 32 | 33 | The date corresponds to the "Bill Date" of the bill. 34 | 35 | Example: 36 | ======== 37 | 38 | def CONFIG_comcast(): 39 | return dict( 40 | module='finance_dl.comcast', 41 | credentials={ 42 | 'username': 'XXXXXX', 43 | 'password': 'XXXXXX', 44 | }, 45 | output_directory=os.path.join(data_dir, 'comcast'), 46 | ) 47 | 48 | 49 | Interactive shell: 50 | ================== 51 | 52 | From the interactive shell, type: `self.run()` to start the scraper. 53 | 54 | """ 55 | 56 | import re 57 | import datetime 58 | import time 59 | import logging 60 | import os 61 | import urllib.parse 62 | 63 | import dateutil.parser 64 | from selenium.webdriver.common.by import By 65 | from selenium.webdriver.support.ui import Select 66 | from selenium.webdriver.common.keys import Keys 67 | 68 | from . 
import scrape_lib 69 | 70 | logger = logging.getLogger('comcast_scrape') 71 | 72 | netloc_re = r'^([^\.@]+\.)*(comcast.com|xfinity.com|comcast.net)$' 73 | 74 | 75 | def check_url(url): 76 | result = urllib.parse.urlparse(url) 77 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 78 | raise RuntimeError('Reached invalid URL: %r' % url) 79 | 80 | 81 | def find_first_matching_date(lines, date_format): 82 | for line in lines: 83 | try: 84 | return datetime.datetime.strptime(line, date_format).date() 85 | except: 86 | pass 87 | return None 88 | 89 | 90 | class Scraper(scrape_lib.Scraper): 91 | def __init__(self, credentials, output_directory, **kwargs): 92 | super().__init__(**kwargs) 93 | self.credentials = credentials 94 | self.output_directory = output_directory 95 | self.logged_in = False 96 | 97 | def check_after_wait(self): 98 | check_url(self.driver.current_url) 99 | 100 | def login(self): 101 | if self.logged_in: 102 | return 103 | logger.info('Initiating log in') 104 | self.driver.get('https://customer.xfinity.com/Secure/MyAccount/') 105 | 106 | (username, password), = self.wait_and_return( 107 | self.find_username_and_password_in_any_frame) 108 | logger.info('Entering username and password') 109 | username.send_keys(self.credentials['username']) 110 | password.send_keys(self.credentials['password']) 111 | with self.wait_for_page_load(): 112 | password.send_keys(Keys.ENTER) 113 | logger.info('Logged in') 114 | self.logged_in = True 115 | 116 | def get_output_path(self, output_dir, date): 117 | journal_date_format = '%Y-%m-%d' 118 | return os.path.join( 119 | output_dir, '%s.bill.pdf' % (date.strftime(journal_date_format))) 120 | 121 | def process_download(self, download_result, output_dir, date): 122 | logger.info('Got download: %s' % download_result[0]) 123 | new_path = self.get_output_path(output_dir, date) 124 | if os.path.exists(new_path): 125 | logger.info('Skipping duplicate download: %s', new_path) 126 | return 127 | tmp_path = new_path + '.tmp' 128 | with open(tmp_path, 'wb') as f: 129 | f.write(download_result[1]) 130 | os.rename(tmp_path, new_path) 131 | logger.info("Wrote %s" % new_path) 132 | 133 | def get_bills(self, output_dir): 134 | logger.info('Looking for bills link') 135 | 136 | def get_bills_link(): 137 | (bills_link, ), = self.wait_and_return( 138 | lambda: self.find_visible_elements_by_descendant_partial_text('View Bill History', 'span')) 139 | return bills_link 140 | 141 | bills_link = get_bills_link() 142 | 143 | for partial_text in ['Check it out', 'Continue To My Account']: 144 | try: 145 | continue_link, = self.find_visible_elements_by_descendant_partial_text( 146 | partial_text, 'button') 147 | continue_link.click() 148 | time.sleep(3.0) # wait for overlay to go away 149 | except: 150 | pass 151 | bills_link = get_bills_link() 152 | 153 | self.driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE) 154 | bills_link.click() 155 | 156 | def get_links(): 157 | links, = self.wait_and_return( 158 | lambda: self.driver.find_elements(By.XPATH, '//a[starts-with(text(), "View PDF")]')) 159 | return links 160 | 161 | links = get_links() 162 | time.sleep(5.0) 163 | links = get_links() 164 | 165 | for link in links: 166 | if not link.is_displayed(): 167 | continue 168 | cur_el = link 169 | bill_date = None 170 | while True: 171 | parent = cur_el.find_element_by_xpath('..') 172 | if parent == cur_el: 173 | break 174 | try: 175 | bill_date = dateutil.parser.parse(parent.text, fuzzy=True) 176 | break 177 | except: 178 | cur_el = parent 179 
| continue 180 | if bill_date is None: 181 | print('skipping link due to no bill date') 182 | continue 183 | bill_date = bill_date + datetime.timedelta(days=1) 184 | new_path = self.get_output_path(output_dir, bill_date) 185 | if os.path.exists(new_path): 186 | logger.info( 187 | "Skipping already-downloaded bill for %s" % bill_date) 188 | else: 189 | logger.info('Attempting download of bill for %s' % bill_date) 190 | link.click() 191 | logger.info('Waiting for download') 192 | download_result, = self.wait_and_return( 193 | self.get_downloaded_file) 194 | self.process_download(download_result, output_dir, bill_date) 195 | 196 | def run(self): 197 | self.login() 198 | if not os.path.exists(self.output_directory): 199 | os.makedirs(self.output_directory) 200 | self.get_bills(self.output_directory) 201 | 202 | 203 | def run(**kwargs): 204 | scrape_lib.run_with_scraper(Scraper, **kwargs) 205 | 206 | 207 | def interactive(**kwargs): 208 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 209 | -------------------------------------------------------------------------------- /finance_dl/update.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import argparse 3 | import importlib 4 | import subprocess 5 | import concurrent.futures 6 | import sys 7 | import threading 8 | import os 9 | import time 10 | 11 | config_prefix = 'CONFIG_' 12 | 13 | 14 | def _format_duration(count) -> str: 15 | seconds_per_day = 24 * 60 * 60 16 | if count >= seconds_per_day: 17 | return '%d days' % (count // seconds_per_day) 18 | return '%d minutes' % (count // 60) 19 | 20 | 21 | class CommandBase: 22 | def __init__(self, args): 23 | self.args = args 24 | self.config_module = importlib.import_module(args.config_module) 25 | self.log_dir = args.log_dir 26 | 27 | def get_all_configs(self) -> List[str]: 28 | names = [] 29 | for key in vars(self.config_module): 30 | if key.startswith(config_prefix): 31 | names.append(key[len(config_prefix):]) 32 | return names 33 | 34 | def get_last_update_path(self, config_name: str) -> str: 35 | return os.path.join(self.log_dir, config_name + '.lastupdate') 36 | 37 | def get_log_path(self, config_name: str) -> str: 38 | return os.path.join(self.log_dir, config_name + '.txt') 39 | 40 | def get_last_update_time(self, config_name: str) -> Optional[float]: 41 | try: 42 | statinfo = os.stat(self.get_last_update_path(config_name)) 43 | return statinfo.st_mtime 44 | except OSError: 45 | return None 46 | 47 | 48 | class StatusCommand(CommandBase): 49 | def __init__(self, args): 50 | super().__init__(args) 51 | 52 | def __call__(self): 53 | cur_time = time.time() 54 | config_names = self.get_all_configs() 55 | max_name_len = max(len(x) for x in config_names) 56 | update_times = [(name, self.get_last_update_time(name)) 57 | for name in config_names] 58 | 59 | def get_time_sort_key(mtime: Optional[int]) -> float: 60 | if mtime is None: 61 | return float('-inf') 62 | return mtime 63 | 64 | update_times.sort(key=lambda x: get_time_sort_key(x[1])) 65 | for name, mtime in update_times: 66 | if mtime is not None: 67 | update_string = '%s (%s ago)' % (time.strftime( 68 | '%c', 69 | time.localtime(mtime)), _format_duration(cur_time - mtime)) 70 | else: 71 | update_string = 'NEVER' 72 | print('%*s: %s' % (max_name_len, name, update_string)) 73 | 74 | 75 | class Updater(CommandBase): 76 | def __init__(self, args): 77 | super().__init__(args) 78 | force = self.args.force 79 | cur_time = time.time() 80 | configs = 
self.args.config 81 | if self.args.all: 82 | configs = self.get_all_configs() 83 | configs_to_update = [] 84 | for config in configs: 85 | mtime = self.get_last_update_time(config) 86 | if not force and mtime is not None and ( 87 | cur_time - mtime) < 24 * 60 * 60: 88 | print('%s: SKIPPING (updated %s ago)' % 89 | (config, _format_duration(cur_time - mtime))) 90 | continue 91 | configs_to_update.append(config) 92 | self.configs_to_update = configs_to_update 93 | self._lock = threading.Lock() 94 | self.configs_completed = 0 95 | 96 | def print_message(self, config, start_time, message, completed=False): 97 | with self._lock: 98 | if completed: 99 | self.configs_completed += 1 100 | print('[%d/%d] %s [%.fs elapsed] %s' % 101 | (self.configs_completed, len(self.configs_to_update), config, 102 | time.time() - start_time, message.rstrip())) 103 | 104 | def run_config(self, config): 105 | start_time = time.time() 106 | self.print_message(config, start_time, 'starting') 107 | success = False 108 | termination_message = 'SUCCESS' 109 | try: 110 | with open( 111 | self.get_log_path(config), 'w', encoding='utf-8', 112 | newline='') as f: 113 | process = subprocess.Popen( 114 | [ 115 | sys.executable, '-m', 'finance_dl.cli', 116 | '--config-module', self.args.config_module, '-c', config 117 | ], 118 | stdout=subprocess.PIPE, 119 | stderr=subprocess.STDOUT, 120 | bufsize=1, 121 | universal_newlines=True, 122 | ) 123 | for line in process.stdout: 124 | self.print_message(config, start_time, line.rstrip()) 125 | f.write(line) 126 | process.wait() 127 | if process.returncode == 0: 128 | success = True 129 | with open( 130 | self.get_last_update_path(config), 131 | 'w', 132 | encoding='utf-8', 133 | newline='') as f: 134 | pass 135 | else: 136 | termination_message = 'FAILED with return code %d' % (process.returncode) 137 | 138 | except: 139 | termination_message = 'FAILED with exception' 140 | self.print_message(config, start_time, termination_message, 141 | completed=True) 142 | 143 | def __call__(self): 144 | with concurrent.futures.ThreadPoolExecutor( 145 | max_workers=self.args.parallelism) as executor: 146 | for config in self.configs_to_update: 147 | executor.submit(self.run_config, config) 148 | 149 | 150 | def main(): 151 | ap = argparse.ArgumentParser( 152 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 153 | ap.add_argument('--config-module', type=str, required=True, 154 | help='Python module defining CONFIG_ functions.') 155 | ap.add_argument('--log-dir', type=str, required=True, 156 | help='Directory containing log files.') 157 | 158 | subparsers = ap.add_subparsers(dest='command') 159 | subparsers.required = True 160 | 161 | ap_status = subparsers.add_parser( 162 | 'status', 163 | help='Show update status.', 164 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 165 | ) 166 | ap_status.set_defaults(command_class=StatusCommand) 167 | 168 | ap_update = subparsers.add_parser( 169 | 'update', 170 | help='Update configurations.', 171 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 172 | ) 173 | ap_update.add_argument('config', nargs='*', type=str, default=[], 174 | help='Configuration to update') 175 | ap_update.add_argument( 176 | '-f', '--force', action='store_true', 177 | help='Force update even if the configuration has already run recently.' 
178 | ) 179 | ap_update.add_argument('-a', '--all', action='store_true', 180 | help='Update all configurations.') 181 | ap_update.add_argument( 182 | '-p', '--parallelism', type=int, default=4, 183 | help='Maximum number of configurations to update in parallel.') 184 | ap_update.set_defaults(command_class=Updater) 185 | 186 | args = ap.parse_args() 187 | 188 | command = args.command_class(args) 189 | command() 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /finance_dl/stockplanconnect.py: -------------------------------------------------------------------------------- 1 | """Retrieves PDF documents from https://www.stockplanconnect.com. 2 | 3 | These PDF documents can be parsed by extracting the text using `pdftotext`. 4 | 5 | This uses the `selenium` Python package in conjunction with `chromedriver` to 6 | scrape the Stockplanconnect website. 7 | 8 | Configuration: 9 | ============== 10 | 11 | The following keys may be specified as part of the configuration dict: 12 | 13 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 14 | keys. 15 | 16 | - `output_directory`: Required. Must be a `str` that specifies the path on the 17 | local filesystem where the documents will be saved. If the directory does not 18 | exist, it will be created. 19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. 25 | 26 | - `headless`: Must be set to `False` currently, as this scraper does not work 27 | properly when run with a headless browser. 28 | 29 | Output format: 30 | ============== 31 | 32 | Each document is saved to the `output_directory` with a name like: 33 | 34 | 2017-02-09.Restricted_Units.Trade_Confirmations.Confirmation.pdf 35 | 2017-08-30.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf 36 | 2017-12-31.Other.Tax_Documents.Form_1099.pdf 37 | 38 | If there are multiple documents of the same type on the same date, a number is 39 | appended, e.g.: 40 | 41 | 2018-05-31.Restricted_Units.Trade_Confirmations.Release_Confirmation.pdf 42 | 2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.2.pdf 43 | 2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.3.pdf 44 | 45 | If for some reason this data source does not work and you wish to manually 46 | download documents, make sure to use the same name numbering scheme: the first 47 | document listed with a given date, document type, and name should be given no 48 | numeric suffix, the second such document should be given a suffix of `.2`, the 49 | third `.3`, etc. 50 | 51 | Example: 52 | ======== 53 | 54 | def CONFIG_stockplanconnect(): 55 | return dict( 56 | module='finance_dl.stockplanconnect', 57 | credentials={ 58 | 'username': 'XXXXXX', 59 | 'password': 'XXXXXX', 60 | }, 61 | output_directory=os.path.join(data_dir, 'stockplanconnect'), 62 | headless=False, 63 | ) 64 | 65 | Interactive shell: 66 | ================== 67 | 68 | From the interactive shell, type: `self.run()` to start the scraper. 
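For reference, here is a minimal sketch of the naming scheme described in the
Output format section above. The helper name and its exact sanitization are
illustrative assumptions; the scraper computes the real filenames itself.

    import re

    def example_output_name(date_str, doc_type, category, name, index=1):
        # Spaces become underscores, other unusual characters are dropped, and
        # only the second and later documents receive a numeric suffix.
        def sanitize(x):
            return re.sub('[^a-zA-Z0-9-_.]', '', x.replace(' ', '_'))
        suffix = '' if index == 1 else '.%d' % index
        return '%s.%s.%s.%s%s.pdf' % (date_str, sanitize(doc_type),
                                      sanitize(category), sanitize(name), suffix)

    # example_output_name('2018-06-28', 'Restricted Units', 'Trade Confirmations',
    #                     'Release Confirmation', 2) ==
    # '2018-06-28.Restricted_Units.Trade_Confirmations.Release_Confirmation.2.pdf'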
69 | 70 | """ 71 | 72 | import urllib.parse 73 | import re 74 | import collections 75 | import time 76 | import logging 77 | import os 78 | 79 | import dateutil.parser 80 | from selenium.webdriver.common.by import By 81 | from selenium.webdriver.support.ui import Select 82 | from selenium.webdriver.common.keys import Keys 83 | 84 | from finance_dl import scrape_lib 85 | 86 | logger = logging.getLogger('scraper') 87 | 88 | netloc_re = r'^([^\.@]+\.)*stockplanconnect.com|([^\.@]+\.)*morganstanley.com$' 89 | 90 | 91 | def check_url(url): 92 | result = urllib.parse.urlparse(url) 93 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 94 | raise RuntimeError('Reached invalid URL: %r' % url) 95 | 96 | 97 | class Scraper(scrape_lib.Scraper): 98 | def __init__(self, credentials, output_directory, **kwargs): 99 | super().__init__(**kwargs) 100 | self.credentials = credentials 101 | self.output_directory = output_directory 102 | 103 | def check_after_wait(self): 104 | check_url(self.driver.current_url) 105 | 106 | def login(self): 107 | logger.info('Initiating log in') 108 | self.driver.get('https://www.stockplanconnect.com') 109 | (username, password), = self.wait_and_return( 110 | self.find_username_and_password_in_any_frame) 111 | time.sleep(2.0) 112 | username.click() 113 | time.sleep(2.0) 114 | logger.info('Entering username') 115 | username.send_keys(self.credentials['username']) 116 | username.click() 117 | time.sleep(2.0) 118 | logger.info('Entering password') 119 | password.click() 120 | time.sleep(1.0) 121 | password.send_keys(self.credentials['password']) 122 | time.sleep(1.0) 123 | with self.wait_for_page_load(): 124 | password.send_keys(Keys.ENTER) 125 | logger.info('Logged in') 126 | 127 | def get_output_path(self, parts, index): 128 | journal_date_format = '%Y-%m-%d' 129 | date = dateutil.parser.parse(parts[0]) 130 | 131 | def sanitize(x): 132 | x = x.replace(' ', '_') 133 | x = re.sub('[^a-zA-Z0-9-_.]', '', x) 134 | return x 135 | 136 | suffix = '' 137 | if index != 1: 138 | suffix = '.%d' % index 139 | 140 | return os.path.join( 141 | self.output_directory, 142 | '%s.%s.%s.%s%s.pdf' % (date.strftime(journal_date_format), 143 | sanitize(parts[1]), sanitize(parts[2]), 144 | sanitize(parts[3]), suffix)) 145 | 146 | def get_documents(self): 147 | logger.info('Looking for documents link') 148 | documents, = self.wait_and_locate((By.PARTIAL_LINK_TEXT, 'Documents')) 149 | scrape_lib.retry(lambda: self.click(documents), num_tries=3, 150 | retry_delay=5) 151 | self.download_documents() 152 | 153 | def download_documents(self): 154 | logger.info('Looking for PDF links') 155 | links, = self.wait_and_return( 156 | lambda: self.driver.find_elements(By.LINK_TEXT, 'PDF')) 157 | links = list(links)[::-1] 158 | previously_seen_parts = collections.Counter() 159 | for link in links: 160 | cur_el = link 161 | output_path = None 162 | while True: 163 | try: 164 | parent = cur_el.find_element_by_xpath('..') 165 | except: 166 | break 167 | if parent == cur_el: 168 | break 169 | full_text = parent.text 170 | parts = full_text.split('\n') 171 | if len(parts) == 5: 172 | try: 173 | key = tuple(parts) 174 | index = previously_seen_parts[key] + 1 175 | previously_seen_parts[key] += 1 176 | output_path = self.get_output_path(parts, index) 177 | break 178 | except: 179 | logger.info('Failed to determine output filename %r', 180 | parts) 181 | break 182 | else: 183 | cur_el = parent 184 | if output_path is None: 185 | logger.info('skipping link due to no date') 186 | continue 187 | if 
os.path.exists(output_path): 188 | logger.info('skipping existing file: %r', output_path) 189 | continue 190 | 191 | self.click(link) 192 | logger.info('Waiting for download') 193 | download_result, = self.wait_and_return(self.get_downloaded_file) 194 | 195 | if not os.path.exists(self.output_directory): 196 | os.makedirs(self.output_directory) 197 | 198 | tmp_path = output_path + '.tmp' 199 | with open(tmp_path, 'wb') as f: 200 | download_data = download_result[1] 201 | f.write(download_data) 202 | os.rename(tmp_path, output_path) 203 | logger.info("Wrote %s", output_path) 204 | 205 | def run(self): 206 | self.login() 207 | self.get_documents() 208 | 209 | 210 | def run(**kwargs): 211 | scrape_lib.run_with_scraper(Scraper, **kwargs) 212 | 213 | 214 | def interactive(**kwargs): 215 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 216 | -------------------------------------------------------------------------------- /finance_dl/waveapps.py: -------------------------------------------------------------------------------- 1 | """Retrieves receipt images and extracted data from waveapps.com. 2 | 3 | This uses the waveapps API (https://docs.waveapps.io/) to retrieve the data 4 | directly. 5 | 6 | 7 | Configuration: 8 | ============== 9 | 10 | The following keys may be specified as part of the configuration dict: 11 | 12 | - `credentials`: Required. Must be a `dict` with a `'token'` key specifying a 13 | Full Access token. To generate a token, first sign in to https://waveapps.com 14 | and then visit the "Manage Applications" page: 15 | https://developer.waveapps.com/hc/en-us/articles/360019762711 16 | 17 | Choose "Create an application", then after creating an application choose 18 | "Create token". 19 | 20 | Alternatively, if you have a valid OAuth2 client id, instead of the `'token'` 21 | field you may specify `'client_id'`, `'username'`, and `'password'` fields. 22 | Signing in with a Google account is not supported. 23 | 24 | - `output_directory`: Required. Must be a `str` that specifies the path on the 25 | local filesystem where the output will be written. If the directory does not 26 | exist, it will be created. 27 | 28 | - `use_business_directory`: Optional. If specified, must be a `bool`. If `True`, 29 | create a subdirectory in `output_directory` to write the output for each 30 | business ID. 31 | 32 | - `active_only`: Optional. If specified, must be a `bool`. If `True`, do not 33 | download deleted receipts. 34 | 35 | Output format: 36 | ============== 37 | 38 | This module downloads receipts for all businesses that are accessible using the 39 | specified `credentials`. The receipts for each business is stored in the 40 | sub-directory of the specified `output_directory` with a name equal to the 41 | business name. If the sub-directory does not exist, it will be created. 42 | 43 | Within each business sub-directory, for each receipt, the JSON data as returned 44 | by the API is saved as `.json`. The JSON data contains at least the 45 | following fields: 46 | 47 | - `id`: The unique receipt identifier, matching the `` portion of 48 | the filename. 49 | 50 | - `date`: The date. 51 | 52 | - `merchant`: Merchant name 53 | 54 | - `note`: Optional note. 55 | 56 | - `total`: Total amount. 57 | 58 | - `currency_code`: The currency code. 59 | 60 | The corresponding receipt images are saved in full resolution as: 61 | `.jpeg`, and if there are additional images, as 62 | `.01.jpeg`, `.02.jpeg`, etc. 
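As a quick illustration of consuming this output, the sketch below walks the
per-business sub-directories and prints a summary line per receipt using the
JSON fields listed above. The helper name is an assumption and is not part of
this module.

    import json
    import os

    def list_receipts(output_directory):
        # One sub-directory per business, one .json file per receipt.
        for business in sorted(os.listdir(output_directory)):
            business_dir = os.path.join(output_directory, business)
            if not os.path.isdir(business_dir):
                continue
            for filename in sorted(os.listdir(business_dir)):
                if not filename.endswith('.json'):
                    continue
                with open(os.path.join(business_dir, filename),
                          encoding='utf-8') as f:
                    receipt = json.load(f)
                print(receipt['date'], receipt['merchant'], receipt['total'],
                      receipt['currency_code'])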
63 | 64 | Example: 65 | ======== 66 | 67 | def CONFIG_waveapps(): 68 | return dict( 69 | module='finance_dl.waveapps', 70 | credentials={ 71 | 'token': 'XXXXXX', 72 | }, 73 | output_directory=os.path.join(data_dir, 'waveapps'), 74 | ) 75 | 76 | """ 77 | 78 | from typing import List, Any 79 | import contextlib 80 | import logging 81 | import json 82 | import os 83 | 84 | import requests 85 | from atomicwrites import atomic_write 86 | 87 | logger = logging.getLogger('waveapps') 88 | 89 | 90 | class WaveScraper(object): 91 | def __init__(self, credentials: dict, output_directory: str, 92 | use_business_directory: bool = False, 93 | active_only: bool = False, headless=None): 94 | del headless 95 | self.credentials = credentials 96 | self.output_directory = output_directory 97 | self.use_business_directory = use_business_directory 98 | self.active_only = active_only 99 | 100 | def get_oauth2_token(self): 101 | if 'token' in self.credentials: 102 | logger.info('Using specified token') 103 | self._oauth_token = { 104 | 'token_type': 'Bearer', 105 | 'access_token': self.credentials['token'] 106 | } 107 | else: 108 | logger.info('Obtaining oauth2 token') 109 | oauth_url = 'https://api.waveapps.com/oauth2/token/' 110 | response = requests.post( 111 | oauth_url, files={ 112 | k: (None, v, None, {}) 113 | for k, v in [ 114 | ('client_id', self.credentials['client_id']), 115 | ('username', self.credentials['username']), 116 | ('grant_type', 'password'), 117 | ('password', self.credentials['password']), 118 | ] 119 | }) 120 | response.raise_for_status() 121 | self._oauth_token = response.json() 122 | self._authenticated_headers = { 123 | 'authorization': 124 | self._oauth_token['token_type'] + ' ' + 125 | self._oauth_token['access_token'], 126 | } 127 | 128 | def get_businesses(self): 129 | logger.info('Getting list of businesses') 130 | response = requests.get( 131 | 'https://api.waveapps.com/businesses/?include_personal=true', 132 | headers=dict(self._authenticated_headers, 133 | accept='application/json'), 134 | ) 135 | response.raise_for_status() 136 | result = response.json() 137 | logger.info('Got %d businesses', len(result)) 138 | return result 139 | 140 | def get_receipts(self, business_id: str): 141 | logger.info('Getting receipts for business %s', business_id) 142 | receipts = [] # type: List[Any] 143 | response = requests.get( 144 | 'https://api.waveapps.com/businesses/' + business_id + 145 | '/receipts/?active_only=' + 146 | (self.active_only and 'true' or 'false'), 147 | headers=dict(self._authenticated_headers, 148 | accept='application/json'), 149 | ) 150 | response.raise_for_status() 151 | result = response.json() 152 | cur_list = result['results'] 153 | logger.info('Received %d receipts', len(cur_list)) 154 | receipts.extend(cur_list) 155 | return receipts 156 | 157 | def save_receipts(self, receipts: List[Any], output_directory: str = None): 158 | if not output_directory: 159 | output_directory = self.output_directory 160 | if not os.path.exists(output_directory): 161 | os.makedirs(output_directory) 162 | for receipt in receipts: 163 | output_prefix = os.path.join(output_directory, 164 | str(receipt['id'])) 165 | json_path = output_prefix + '.json' 166 | for image_i, image in enumerate(receipt['images']): 167 | image_url = image['file'] 168 | if image_i == 0: 169 | image_path = '%s.jpg' % (output_prefix, ) 170 | else: 171 | image_path = '%s.%02d.jpg' % (output_prefix, image_i) 172 | if not os.path.exists(image_path): 173 | logger.info('Downloading receipt image %s', image_url) 174 | r 
= requests.get(image_url) 175 | r.raise_for_status() 176 | data = r.content 177 | with atomic_write(image_path, mode='wb') as f: 178 | f.write(data) 179 | with atomic_write( 180 | json_path, 181 | mode='w', 182 | overwrite=True, 183 | encoding='utf-8', 184 | newline='\n') as f: 185 | json.dump(receipt, f, sort_keys=True, indent=' ') 186 | 187 | def run(self): 188 | self.get_oauth2_token() 189 | output_directory = self.output_directory 190 | businesses = self.get_businesses() 191 | for business in businesses: 192 | business_id = business['id'] 193 | receipts = self.get_receipts(business_id) 194 | if receipts and self.use_business_directory: 195 | output_directory = os.path.join(self.output_directory, 196 | business_id) 197 | self.save_receipts(receipts, output_directory) 198 | 199 | 200 | def run(**kwargs): 201 | scraper = WaveScraper(**kwargs) 202 | scraper.run() 203 | 204 | 205 | @contextlib.contextmanager 206 | def interactive(**kwargs): 207 | scraper = WaveScraper(**kwargs) 208 | kwargs['scraper'] = scraper 209 | yield kwargs 210 | -------------------------------------------------------------------------------- /finance_dl/ultipro_google.py: -------------------------------------------------------------------------------- 1 | """Retrieves Google employee payroll statements from Ultipro in PDF format. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Ultipro website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the PDF pay statements will be written. If the 16 | directory does not exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. 23 | 24 | - `headless`: Optional. If specified, must be a `bool`. Defaults to `True`. 25 | Indicates whether to use a headless browser. Scraping appears to be more 26 | reliable when this is set to `True`. 27 | 28 | Output format: 29 | ============== 30 | 31 | Each pay statement is downloaded in PDF format and saved to the 32 | `output_directory` with a filename of `%Y-%m-%d.statement-.pdf`, where 33 | `` is the document number in the "Pay History" list. In some cases, due to 34 | a bug of some sort, the document number in the "Pay History" list may differ 35 | from the document number included in the actual document. Such discrepancies 36 | are handled by the `beancount_import.source.ultipro_google` module. 37 | 38 | Example: 39 | ======== 40 | 41 | def CONFIG_google_payroll(): 42 | return dict( 43 | module='finance_dl.ultipro_google', 44 | credentials={ 45 | 'username': 'XXXXXX', 46 | 'password': 'XXXXXX', 47 | }, 48 | output_directory=os.path.join(data_dir, 'documents', 'Income', 49 | 'Google'), 50 | 51 | # profile_dir is optional but recommended. 52 | profile_dir=os.path.join(profile_dir, 'google_payroll'), 53 | 54 | # Recommended for greater reliability. 
55 | headless=False, 56 | ) 57 | 58 | Interactive shell: 59 | ================== 60 | 61 | From the interactive shell, type: `self.run()` to start the scraper. 62 | 63 | """ 64 | 65 | import datetime 66 | import logging 67 | import os 68 | import re 69 | import urllib.parse 70 | from selenium.webdriver.common.by import By 71 | from selenium.webdriver.support.ui import Select 72 | from selenium.webdriver.common.keys import Keys 73 | from atomicwrites import atomic_write 74 | from . import scrape_lib, google_login 75 | 76 | logger = logging.getLogger('ultipro') 77 | 78 | output_date_format = '%Y-%m-%d' 79 | 80 | 81 | class Scraper(scrape_lib.Scraper): 82 | def __init__(self, 83 | credentials, 84 | output_directory, 85 | login_url='https://googlemypay.ultipro.com', 86 | netloc_re=r'^([^\.@]+\.)*(ultipro.com|google.com)$', 87 | **kwargs): 88 | super().__init__(**kwargs) 89 | self.credentials = credentials 90 | self.login_url = login_url 91 | self.netloc_re = netloc_re 92 | self.output_directory = output_directory 93 | 94 | def check_url(self, url): 95 | result = urllib.parse.urlparse(url) 96 | if result.scheme != 'https' or not re.fullmatch(self.netloc_re, 97 | result.netloc): 98 | raise RuntimeError('Reached invalid URL: %r' % url) 99 | 100 | def check_after_wait(self): 101 | self.check_url(self.driver.current_url) 102 | 103 | def login(self): 104 | google_login.login(self, self.login_url) 105 | 106 | def get_next_statement(self, 107 | existing_statements=set(), 108 | downloaded_statements=set()): 109 | pay_history, = self.wait_and_return( 110 | lambda: self.find_element_in_any_frame( 111 | By.PARTIAL_LINK_TEXT, "Pay History", only_displayed=True)) 112 | pay_history.click() 113 | 114 | def get_statement_table(): 115 | try: 116 | for table in self.find_elements_in_any_frame( 117 | By.TAG_NAME, 'table', only_displayed=True): 118 | headings = [ 119 | x.text.strip() 120 | for x in table.find_elements_by_xpath('thead/tr/th') 121 | ] 122 | if 'Pay Date' in headings and 'Document Number' in headings: 123 | return table 124 | except: 125 | import traceback 126 | traceback.print_exc() 127 | 128 | table, = self.wait_and_return(get_statement_table) 129 | date_format = '%m/%d/%Y' 130 | for row in table.find_elements_by_xpath('tbody/tr'): 131 | row_text = [ 132 | x.text.strip() for x in row.find_elements_by_tag_name('td') 133 | ] 134 | row_text = [x for x in row_text if x] 135 | pay_date = row_text[0] 136 | document_number = row_text[1] 137 | assert re.fullmatch('[0-9A-Z]+', document_number), document_number 138 | pay_date = datetime.datetime.strptime(pay_date, date_format).date() 139 | document_str = 'Document %r : %r' % (pay_date, document_number) 140 | if (pay_date, document_number) in existing_statements: 141 | logger.info(' Found in existing') 142 | continue 143 | if (pay_date, document_number) not in downloaded_statements: 144 | logger.info('%s: Downloading', document_str) 145 | link = row.find_element_by_tag_name('a') 146 | link.click() 147 | download_link, = self.wait_and_return( 148 | lambda: self.find_element_in_any_frame( 149 | By.XPATH, 150 | '//input[@type="image" and contains(@title, "download")]' 151 | )) 152 | download_link.click() 153 | logger.info('%s: Waiting to get download', document_str) 154 | download_result, = self.wait_and_return( 155 | self.get_downloaded_file) 156 | name, data = download_result 157 | if len(data) < 5000: 158 | raise RuntimeError( 159 | 'Downloaded file size is invalid: %d' % len(data)) 160 | output_name = '%s.statement-%s.pdf' % ( 161 | 
pay_date.strftime('%Y-%m-%d'), document_number) 162 | output_path = os.path.join(self.output_directory, output_name) 163 | with atomic_write(output_path, mode='wb') as f: 164 | f.write(data) 165 | downloaded_statements.add((pay_date, document_number)) 166 | return True 167 | else: 168 | logger.info('%s: Just downloaded', document_str) 169 | return False 170 | 171 | def get_existing_statements(self): 172 | existing_statements = set() 173 | if os.path.exists(self.output_directory): 174 | for name in os.listdir(self.output_directory): 175 | m = re.fullmatch( 176 | r'([0-9]{4})-([0-9]{2})-([0-9]{2})\.statement-([0-9A-Z]+)\.pdf', 177 | name) 178 | if m is not None: 179 | date = datetime.date( 180 | year=int(m.group(1)), 181 | month=int(m.group(2)), 182 | day=int(m.group(3))) 183 | statement_number = m.group(4) 184 | existing_statements.add((date, statement_number)) 185 | logger.info('Found existing statement %r %r', date, 186 | statement_number) 187 | else: 188 | logger.warning( 189 | 'Ignoring extraneous file in existing statement directory: %r', 190 | os.path.join(self.output_directory, name)) 191 | return existing_statements 192 | 193 | def download_statements(self): 194 | if not os.path.exists(self.output_directory): 195 | os.makedirs(self.output_directory) 196 | existing_statements = self.get_existing_statements() 197 | downloaded_statements = set() 198 | while self.get_next_statement( 199 | existing_statements=existing_statements, 200 | downloaded_statements=downloaded_statements, 201 | ): 202 | pass 203 | 204 | def run(self): 205 | self.login() 206 | self.download_statements() 207 | 208 | 209 | def run(**kwargs): 210 | scrape_lib.run_with_scraper(Scraper, **kwargs) 211 | 212 | 213 | def interactive(**kwargs): 214 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 215 | -------------------------------------------------------------------------------- /finance_dl/paypal.py: -------------------------------------------------------------------------------- 1 | """Retrieves Paypal activity from https://paypal.com. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Google purchases website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. 23 | 24 | Output format: 25 | ============== 26 | 27 | For each Paypal transaction, two files are written to the specified 28 | `output_directory`: `.json` contains a JSON representation of the 29 | transaction as returned by the Paypal server, and `.html` contains an HTML 30 | representation. 31 | 32 | For invoices, instead the files `.pdf` and `.invoice.json` are written 33 | to the specified `output_directory`. 34 | 35 | Interactive shell: 36 | ================== 37 | 38 | From the interactive shell, type: `self.run()` to start the scraper. 
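Example:
========

A configuration sketch following the same pattern as the other modules in this
package; the `data_dir` and `profile_dir` variables and the placeholder
credentials are assumptions to be replaced with your own values.

    def CONFIG_paypal():
        return dict(
            module='finance_dl.paypal',
            credentials={
                'username': 'XXXXXX',
                'password': 'XXXXXX',
            },
            output_directory=os.path.join(data_dir, 'paypal'),
            # profile_dir is optional.
            profile_dir=os.path.join(profile_dir, 'paypal'),
        )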
39 | 40 | """ 41 | 42 | from typing import List, Any 43 | import urllib.parse 44 | import re 45 | import json 46 | import logging 47 | import datetime 48 | import os 49 | from selenium.webdriver.common.by import By 50 | from selenium.webdriver.support.ui import Select 51 | from selenium.webdriver.common.keys import Keys 52 | from selenium.common.exceptions import NoSuchElementException 53 | import jsonschema 54 | from atomicwrites import atomic_write 55 | from . import scrape_lib 56 | from . import google_login 57 | 58 | logger = logging.getLogger('paypal') 59 | 60 | netloc_re = r'^([^\.@]+\.)*paypal.com$' 61 | 62 | transaction_list_schema = { 63 | '#schema': 'http://json-schema.org/draft-07/schema#', 64 | 'description': 'JSON schema for the transaction list response.', 65 | 'type': 'object', 66 | 'required': ['data'], 67 | 'properties': { 68 | 'data': { 69 | 'type': 'object', 70 | 'required': ['activity'], 71 | 'properties': { 72 | 'activity': { 73 | 'type': 'object', 74 | 'required': ['transactions'], 75 | 'properties': { 76 | 'transactions': { 77 | 'type': 'array', 78 | 'items': { 79 | 'type': 'object', 80 | 'required': ['id'], 81 | 'properties': { 82 | 'id': { 83 | 'type': 'string', 84 | 'pattern': r'^[A-Za-z0-9\-]+$', 85 | }, 86 | }, 87 | } 88 | }, 89 | }, 90 | }, 91 | }, 92 | }, 93 | }, 94 | } 95 | 96 | transaction_details_schema = { 97 | '#schema': 'http://json-schema.org/draft-07/schema#', 98 | 'description': 'JSON schema for the transaction details response.', 99 | 'type': 'object', 100 | 'required': ['data'], 101 | 'properties': { 102 | 'data': { 103 | 'type': 'object', 104 | 'required': ['details'], 105 | 'properties': { 106 | 'details': { 107 | 'type': 'object', 108 | }, 109 | }, 110 | }, 111 | }, 112 | } 113 | 114 | 115 | def check_url(url): 116 | result = urllib.parse.urlparse(url) 117 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 118 | raise RuntimeError('Reached invalid URL: %r' % url) 119 | 120 | 121 | class Scraper(scrape_lib.Scraper): 122 | def __init__(self, credentials: dict, output_directory: str, **kwargs): 123 | super().__init__(use_seleniumrequests=True, **kwargs) 124 | self.credentials = credentials 125 | self.output_directory = output_directory 126 | self.logged_in = False 127 | 128 | def check_after_wait(self): 129 | check_url(self.driver.current_url) 130 | 131 | def login(self): 132 | if self.logged_in: 133 | return 134 | 135 | self.driver.get('https://www.paypal.com/us/signin') 136 | logger.info('Finding username field') 137 | username, = self.wait_and_locate((By.XPATH, '//input[@type="email"]'), 138 | only_displayed=True) 139 | logger.info('Entering username') 140 | username.send_keys(self.credentials['username']) 141 | username.send_keys(Keys.ENTER) 142 | logger.info('Finding password field') 143 | password, = self.wait_and_locate( 144 | (By.XPATH, '//input[@type="password"]'), only_displayed=True) 145 | logger.info('Entering password') 146 | password.send_keys(self.credentials['password']) 147 | with self.wait_for_page_load(): 148 | password.send_keys(Keys.ENTER) 149 | logger.info('Logged in') 150 | self.logged_in = True 151 | self.csrf_token = None 152 | 153 | def make_json_request(self, url): 154 | return self.driver.request( 155 | 'GET', url, headers={ 156 | 'x-csrf-token': self.get_csrf_token(), 157 | 'accept': 'application/json, text/javascript, */*; q=0.01', 158 | 'x-requested-with': 'XMLHttpRequest' 159 | }) 160 | 161 | def get_csrf_token(self): 162 | if self.csrf_token is not None: return self.csrf_token 163 | 
logging.info('Getting CSRF token') 164 | self.driver.get('https://www.paypal.com/myaccount/transactions/') 165 | # Get CSRF token 166 | body_element, = self.wait_and_locate((By.XPATH, 167 | '//body[@data-token!=""]')) 168 | self.csrf_token = body_element.get_attribute('data-token') 169 | return self.csrf_token 170 | 171 | def get_transaction_list(self): 172 | end_date = datetime.datetime.now().date() + datetime.timedelta(days=2) 173 | start_date = end_date - datetime.timedelta(days=365 * 10) 174 | date_format = '%Y-%m-%d' 175 | logging.info('Getting transaction list') 176 | url = ( 177 | 'https://www.paypal.com/myaccount/transactions/filter?' 178 | 'transactionType=ALL&nextPageToken=&freeTextSearch=&isClearFreeTextSearch=false&' 179 | 'isClearFilterSelection=false&isClientSideFiltering=false&selectedCurrency=ALL&' 180 | 'startDate=%s&endDate=%s' % (start_date.strftime(date_format), 181 | end_date.strftime(date_format))) 182 | resp = self.make_json_request(url) 183 | resp.raise_for_status() 184 | j = resp.json() 185 | jsonschema.validate(j, transaction_list_schema) 186 | return j['data']['activity']['transactions'] 187 | 188 | def save_transactions(self): 189 | transaction_list = self.get_transaction_list() 190 | logging.info('Got %d transactions', len(transaction_list)) 191 | for transaction in transaction_list: 192 | transaction_id = transaction['id'] 193 | output_prefix = os.path.join(self.output_directory, transaction_id) 194 | if transaction_id.startswith('INV'): 195 | pdf_path = output_prefix + '.pdf' 196 | if not os.path.exists(pdf_path): 197 | invoice_url = ( 198 | 'https://www.paypal.com/invoice/payerView/detailsInternal/' 199 | + transaction_id + '?printPdfMode=true') 200 | logging.info('Retrieving PDF %s', invoice_url) 201 | r = self.driver.request('GET', invoice_url) 202 | r.raise_for_status() 203 | data = r.content 204 | with atomic_write(pdf_path, mode='wb') as f: 205 | f.write(data) 206 | invoice_json_path = output_prefix + '.invoice.json' 207 | if not os.path.exists(invoice_json_path): 208 | with atomic_write( 209 | invoice_json_path, 210 | mode='w', 211 | encoding='utf-8', 212 | newline='\n') as f: 213 | f.write(json.dumps(transaction, indent=' ')) 214 | continue 215 | details_url = ( 216 | 'https://www.paypal.com/myaccount/transactions/details/' + 217 | transaction_id) 218 | inline_details_url = ( 219 | 'https://www.paypal.com/myaccount/transactions/details/inline/' 220 | + transaction_id) 221 | html_path = output_prefix + '.html' 222 | json_path = output_prefix + '.json' 223 | if not os.path.exists(html_path): 224 | logging.info('Retrieving HTML %s', details_url) 225 | html_resp = self.driver.request('GET', details_url) 226 | html_resp.raise_for_status() 227 | with atomic_write( 228 | html_path, mode='w', encoding='utf-8', 229 | newline='\n') as f: 230 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 231 | f.write('\ufeff' + html_resp.text) 232 | if not os.path.exists(json_path): 233 | logging.info('Retrieving JSON %s', inline_details_url) 234 | json_resp = self.make_json_request(inline_details_url) 235 | json_resp.raise_for_status() 236 | j = json_resp.json() 237 | jsonschema.validate(j, transaction_details_schema) 238 | with atomic_write(json_path, mode='wb') as f: 239 | f.write( 240 | json.dumps(j['data']['details'], indent=' ').encode()) 241 | 242 | def run(self): 243 | if not os.path.exists(self.output_directory): 244 | os.makedirs(self.output_directory) 245 | self.login() 246 | self.save_transactions() 247 | 248 | 249 | 
def run(**kwargs): 250 | scrape_lib.run_with_scraper(Scraper, **kwargs) 251 | 252 | 253 | def interactive(**kwargs): 254 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 255 | -------------------------------------------------------------------------------- /finance_dl/amazon.py: -------------------------------------------------------------------------------- 1 | """Retrieves order invoices from Amazon. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Amazon website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `amazon_domain`: Optional. Specifies the Amazon domain from which to download 19 | orders. Must be one of `'.com'` or `'.co.cuk'`. Defaults to `'.com'`. 20 | 21 | - `regular`: Optional. Must be a `bool`. If `True` (the default), download regular orders. 22 | 23 | - `digital`: Optional. Must be a `bool` or `None`. If `True`, download digital 24 | orders. Defaults to `None`, which is equivalent to `True` for 25 | `amazon_domain=".com"`, and `False` for `amazon_domain=".co.uk"`. 26 | 27 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 28 | path to a persistent Chrome browser profile to use. This should be a path 29 | used solely for this single configuration; it should not refer to your normal 30 | browser profile. If not specified, a fresh temporary profile will be used 31 | each time. 32 | 33 | Output format: 34 | ============== 35 | 36 | Each regular or digital order invoice is written in HTML format to the specified 37 | `output_directory` using the naming scheme `.html`, 38 | e.g. `166-7926740-5141621.html` for a regular order invoice and 39 | `D56-5204779-4181560.html` for a digital order invoice. 40 | 41 | Example: 42 | ======== 43 | 44 | def CONFIG_amazon(): 45 | return dict( 46 | module='finance_dl.amazon', 47 | credentials={ 48 | 'username': 'XXXXXX', 49 | 'password': 'XXXXXX', 50 | }, 51 | output_directory=os.path.join(data_dir, 'amazon'), 52 | # profile_dir is optional. 53 | profile_dir=os.path.join(profile_dir, 'amazon'), 54 | ) 55 | 56 | Interactive shell: 57 | ================== 58 | 59 | From the interactive shell, type: `self.run()` to start the scraper. 60 | 61 | """ 62 | 63 | import urllib.parse 64 | import re 65 | import logging 66 | import os 67 | from selenium.webdriver.common.by import By 68 | from selenium.webdriver.support.ui import Select 69 | from selenium.webdriver.common.keys import Keys 70 | from atomicwrites import atomic_write 71 | from . 
import scrape_lib 72 | 73 | logger = logging.getLogger('amazon_scrape') 74 | 75 | 76 | class Domain: 77 | COM = 'com' 78 | CO_UK = 'co.uk' 79 | 80 | 81 | class Scraper(scrape_lib.Scraper): 82 | def __init__(self, credentials, output_directory, amazon_domain=Domain.COM, regular=True, digital=None, **kwargs): 83 | super().__init__(**kwargs) 84 | default_digital = True if amazon_domain == Domain.COM else False 85 | self.credentials = credentials 86 | self.output_directory = output_directory 87 | self.logged_in = False 88 | self.amazon_domain = amazon_domain 89 | self.regular = regular 90 | self.digital = digital if digital is not None else default_digital 91 | 92 | def check_url(self, url): 93 | netloc_re = r'^([^\.@]+\.)*amazon.' + self.amazon_domain + '$' 94 | result = urllib.parse.urlparse(url) 95 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 96 | raise RuntimeError('Reached invalid URL: %r' % url) 97 | 98 | def check_after_wait(self): 99 | self.check_url(self.driver.current_url) 100 | 101 | def login(self): 102 | logger.info('Initiating log in') 103 | self.driver.get('https://www.amazon.' + self.amazon_domain) 104 | if self.logged_in: 105 | return 106 | 107 | sign_out_links = self.find_elements_by_descendant_partial_text('Sign Out', 'a') 108 | if len(sign_out_links) > 0: 109 | logger.info('You must be already logged in!') 110 | self.logged_in = True 111 | return 112 | 113 | logger.info('Looking for sign-in link') 114 | sign_in_links, = self.wait_and_return( 115 | lambda: self.find_visible_elements_by_descendant_partial_text('Sign in', 'a') 116 | ) 117 | 118 | self.click(sign_in_links[0]) 119 | logger.info('Looking for username link') 120 | (username, ), = self.wait_and_return( 121 | lambda: self.find_visible_elements(By.XPATH, '//input[@type="email"]') 122 | ) 123 | username.send_keys(self.credentials['username']) 124 | 125 | logger.info('Looking for password link') 126 | (password, ), = self.wait_and_return( 127 | lambda: self.find_visible_elements(By.XPATH, '//input[@type="password"]') 128 | ) 129 | password.send_keys(self.credentials['password']) 130 | 131 | logger.info('Looking for "remember me" checkbox') 132 | (rememberMe, ) = self.wait_and_return( 133 | lambda: self.find_visible_elements(By.XPATH, '//input[@name="rememberMe"]')[0] 134 | ) 135 | rememberMe.click() 136 | 137 | password.send_keys(Keys.ENTER) 138 | 139 | logger.info('Logged in') 140 | self.logged_in = True 141 | 142 | def get_invoice_path(self, order_id): 143 | return os.path.join(self.output_directory, order_id + '.html') 144 | 145 | def get_orders(self, regular=True, digital=True): 146 | invoice_hrefs = [] 147 | order_ids_seen = set() 148 | 149 | def get_invoice_urls(): 150 | initial_iteration = True 151 | while True: 152 | 153 | def invoice_finder(): 154 | return self.driver.find_elements(By.XPATH, '//a[contains(@href, "orderID=")]') 155 | 156 | if initial_iteration: 157 | invoices = invoice_finder() 158 | else: 159 | invoices, = self.wait_and_return(invoice_finder) 160 | initial_iteration = False 161 | 162 | order_ids = set() 163 | for invoice_link in invoices: 164 | href = invoice_link.get_attribute('href') 165 | m = re.match('.*[&?]orderID=((?:D)?[0-9\\-]+)(?:&.*)?$', href) 166 | if m is None: 167 | raise RuntimeError( 168 | 'Failed to parse order ID from href %r' % (href, )) 169 | order_id = m[1] 170 | if order_id in order_ids: 171 | continue 172 | order_ids.add(order_id) 173 | invoice_path = self.get_invoice_path(order_id) 174 | if order_id in order_ids_seen: 175 | 
logger.info('Skipping already-seen order id: %r', 176 | order_id) 177 | continue 178 | if os.path.exists(invoice_path): 179 | logger.info('Skipping already-downloaded invoice: %r', 180 | order_id) 181 | continue 182 | print_url = 'https://www.amazon.%s/gp/css/summary/print.html?ie=UTF8&orderID=%s' % ( 183 | self.amazon_domain, order_id) 184 | invoice_hrefs.append((print_url, order_id)) 185 | order_ids_seen.add(order_id) 186 | 187 | # Find next link 188 | next_links = self.find_elements_by_descendant_text_match( 189 | '. = "Next"', 'a', only_displayed=True) 190 | if len(next_links) == 0: 191 | logger.info('Found no more pages') 192 | break 193 | if len(next_links) != 1: 194 | raise RuntimeError('More than one next link found') 195 | with self.wait_for_page_load(): 196 | self.click(next_links[0]) 197 | 198 | def retrieve_all_order_groups(): 199 | order_select_index = 0 200 | 201 | while True: 202 | (order_filter,), = self.wait_and_return( 203 | lambda: self.find_visible_elements(By.XPATH, '//select[@name="orderFilter"]') 204 | ) 205 | order_select = Select(order_filter) 206 | num_options = len(order_select.options) 207 | if order_select_index >= num_options: 208 | break 209 | option_text = order_select.options[ 210 | order_select_index].text.strip() 211 | if option_text != 'Archived Orders': 212 | logger.info('Retrieving order group: %r', option_text) 213 | with self.wait_for_page_load(): 214 | order_select.select_by_index(order_select_index) 215 | get_invoice_urls() 216 | 217 | order_select_index += 1 218 | if order_select_index >= num_options: 219 | break 220 | 221 | if regular: 222 | orders_text = "Your Orders" if self.amazon_domain == Domain.CO_UK else "Orders" 223 | # on co.uk, orders link is hidden behind the menu, hence not directly clickable 224 | (orders_link,), = self.wait_and_return( 225 | lambda: self.find_elements_by_descendant_text_match('. = "{}"'.format(orders_text), 'a', only_displayed=False) 226 | ) 227 | link = orders_link.get_attribute('href') 228 | scrape_lib.retry(lambda: self.driver.get(link), retry_delay=2) 229 | 230 | retrieve_all_order_groups() 231 | 232 | if digital: 233 | (digital_orders_link,), = self.wait_and_return( 234 | lambda: self.find_elements_by_descendant_text_match('contains(., "Digital Orders")', 'a', only_displayed=True) 235 | ) 236 | scrape_lib.retry(lambda: self.click(digital_orders_link), 237 | retry_delay=2) 238 | retrieve_all_order_groups() 239 | 240 | self.retrieve_invoices(invoice_hrefs) 241 | 242 | def retrieve_invoices(self, invoice_hrefs): 243 | for href, order_id in invoice_hrefs: 244 | invoice_path = self.get_invoice_path(order_id) 245 | 246 | logger.info('Downloading invoice for order %r', order_id) 247 | with self.wait_for_page_load(): 248 | self.driver.get(href) 249 | 250 | # For digital orders, Amazon dynamically generates some of the information. 251 | # Wait until it is all generated. 
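            # get_source below returns None until the 'Grand Total:' text is
            # present in the page source; self.wait_and_return re-evaluates it
            # until it yields a value.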
252 | def get_source(): 253 | source = self.driver.page_source 254 | if 'Grand Total:' in source: 255 | return source 256 | return None 257 | 258 | page_source, = self.wait_and_return(get_source) 259 | if order_id not in page_source: 260 | raise ValueError('Failed to retrieve information for order %r' 261 | % (order_id, )) 262 | with atomic_write( 263 | invoice_path, mode='w', encoding='utf-8', 264 | newline='\n') as f: 265 | # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 266 | f.write('\ufeff' + page_source) 267 | logger.info(' Wrote %s', invoice_path) 268 | 269 | def run(self): 270 | self.login() 271 | if not os.path.exists(self.output_directory): 272 | os.makedirs(self.output_directory) 273 | self.get_orders(regular=self.regular, digital=self.digital) 274 | 275 | 276 | def run(**kwargs): 277 | scrape_lib.run_with_scraper(Scraper, **kwargs) 278 | 279 | 280 | def interactive(**kwargs): 281 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 282 | -------------------------------------------------------------------------------- /finance_dl/venmo.py: -------------------------------------------------------------------------------- 1 | """Retrieves transaction and balance information from Venmo. 2 | 3 | This uses the `selenium` Python package in conjunction with `chromedriver` to 4 | scrape the Venmo website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. It is highly recommended to specify a `profile_dir` to avoid 23 | having to manually enter a multi-factor authentication code each time. 24 | 25 | - `earliest_history_date`: Optional. If specified, must be a `datetime.date` 26 | specifying the earliest UTC date for which to retrieve data. 27 | 28 | - `max_history_days`: Optional. If `earliest_history_date` is not specified, 29 | this must be a positive `int` specifying the number of days of history to 30 | retrieve, starting from the previous UTC day. Defaults to `365*4`. If 31 | `earliest_history_date` is specified, `max_history_days` has no effect. 32 | 33 | Output format: 34 | ============== 35 | 36 | The retrieved transaction and balance information is merged into the 37 | `transactions.csv` and `balances.csv` files within the specified 38 | `output_directory`. Note that any existing transaction and balance information 39 | in those files is not overwritten; instead, new information is merged in without 40 | introducing duplicates. 
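Reading the merged files back is straightforward; for example (a sketch only,
using the `transactions.csv` column names shown below):

    import csv

    with open('transactions.csv', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            print(row['Datetime'], row['Type'], row['Amount (total)'])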
41 | 42 | The `transactions.csv` file is in the same CSV download format provided directly 43 | from the Venmo website, and has the format: 44 | 45 | " ID","Datetime","Type","Status","Note","From","To","Amount (total)","Amount (fee)","Funding Source","Destination" 46 | 47 | The `balances.csv` file is created from scraping the HTML and has the format: 48 | 49 | "Start Date","End Date","Start Balance","End Balance" 50 | 51 | Example: 52 | ======== 53 | 54 | def CONFIG_venmo(): 55 | return dict( 56 | module='finance_dl.venmo', 57 | credentials={ 58 | 'username': 'XXXXXX', 59 | 'password': 'XXXXXX', 60 | }, 61 | output_directory=os.path.join(data_dir, 'venmo'), 62 | 63 | # profile_dir is optional but highly recommended to avoid having to 64 | # enter multi-factor authentication code each time. 65 | profile_dir=os.path.join(profile_dir, 'venmo'), 66 | ) 67 | 68 | Interactive shell: 69 | ================== 70 | 71 | From the interactive shell, type: `self.run()` to start the scraper. 72 | 73 | """ 74 | 75 | import io 76 | import csv 77 | import urllib.parse 78 | import re 79 | import dateutil.parser 80 | import datetime 81 | import logging 82 | import os 83 | from selenium.webdriver.common.by import By 84 | from selenium.common.exceptions import NoSuchElementException 85 | from selenium.webdriver.support.ui import Select 86 | from selenium.webdriver.common.keys import Keys 87 | 88 | from . import scrape_lib 89 | from . import csv_merge 90 | 91 | logger = logging.getLogger('venmo_scrape') 92 | 93 | netloc_re = r'^([^\.@]+\.)*venmo.com$' 94 | 95 | 96 | def check_url(url): 97 | result = urllib.parse.urlparse(url) 98 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 99 | raise RuntimeError('Reached invalid URL: %r' % url) 100 | 101 | 102 | balance_field_names = [ 103 | 'Start Date', 'End Date', 'Start Balance', 'End Balance' 104 | ] 105 | 106 | standard_date_format = '%Y-%m-%d' 107 | 108 | 109 | def parse_csv_date(x): 110 | return dateutil.parser.parse( 111 | x, ignoretz=True).replace(tzinfo=datetime.timezone.utc) 112 | 113 | 114 | class Scraper(scrape_lib.Scraper): 115 | def __init__(self, credentials, output_directory, 116 | earliest_history_date=None, max_history_days=365 * 4, 117 | **kwargs): 118 | """ 119 | @param earliest_history_date: Earliest UTC date for which to retrieve 120 | transactions and balance information. 121 | 122 | @param max_history_days: Number of days of history to retrieve, starting 123 | from the previous UTC day, if `earliest_history_date` is not 124 | specified. 
125 | """ 126 | super().__init__(**kwargs) 127 | self.credentials = credentials 128 | self.output_directory = output_directory 129 | if not os.path.exists(self.output_directory): 130 | os.makedirs(self.output_directory) 131 | self.transactions_path = os.path.join(output_directory, 132 | 'transactions.csv') 133 | self.balances_path = os.path.join(output_directory, 'balances.csv') 134 | self.latest_history_date = ( 135 | datetime.datetime.now() - datetime.timedelta(days=1)).astimezone( 136 | datetime.timezone.utc).date() 137 | if earliest_history_date is None: 138 | self.earliest_history_date = self.latest_history_date - datetime.timedelta( 139 | days=max_history_days) 140 | else: 141 | self.earliest_history_date = dateutil.parser.parse( 142 | earliest_history_date).date() 143 | self.logged_in = False 144 | 145 | def check_after_wait(self): 146 | check_url(self.driver.current_url) 147 | 148 | def login(self): 149 | if self.logged_in: 150 | return 151 | logger.info('Initiating log in') 152 | self.driver.get('https://venmo.com/account/sign-in') 153 | 154 | (username, password), = self.wait_and_return( 155 | self.find_username_and_password_in_any_frame) 156 | logger.info('Entering username and password') 157 | username.send_keys(self.credentials['username']) 158 | password.send_keys(self.credentials['password']) 159 | with self.wait_for_page_load(): 160 | password.send_keys(Keys.ENTER) 161 | logger.info('Logged in') 162 | self.logged_in = True 163 | 164 | def goto_statement(self, start_date, end_date): 165 | url_date_format = '%m-%d-%Y' 166 | with self.wait_for_page_load(): 167 | self.driver.get( 168 | 'https://venmo.com/account/statement?end=%s&start=%s' % 169 | (end_date.strftime(url_date_format), 170 | start_date.strftime(url_date_format))) 171 | 172 | def download_csv(self): 173 | logger.info('Looking for CSV link') 174 | download_button, = self.wait_and_locate( 175 | (By.XPATH, '//button[text() = "Download CSV"]')) 176 | self.click(download_button) 177 | logger.info('Waiting for CSV download') 178 | download_result, = self.wait_and_return(self.get_downloaded_file) 179 | logger.info('Got CSV download') 180 | return download_result[1] 181 | 182 | def get_balance(self, balance_type): 183 | try: 184 | balance_node = self.driver.find_element( 185 | By.XPATH, '//*[@class="%s"]/child::*[@class="balance-amt"]' % 186 | balance_type) 187 | return balance_node.text 188 | except NoSuchElementException: 189 | return None 190 | 191 | def get_balances(self): 192 | def maybe_get_balance(): 193 | start_balance = self.get_balance('start-balance') 194 | end_balance = self.get_balance('end-balance') 195 | if start_balance is not None and end_balance is not None: 196 | return (start_balance, end_balance) 197 | try: 198 | error_node = self.driver.find_element( 199 | By.XPATH, '//*[@class="account-statement-error"]') 200 | error_text = error_node.text 201 | logging.info('Saw error text: %s', error_text) 202 | if error_text.startswith('Loading'): 203 | return None 204 | return ('unknown', 'unknown') 205 | except NoSuchElementException: 206 | return None 207 | 208 | result, = self.wait_and_return(maybe_get_balance) 209 | return result 210 | 211 | def write_csv(self, csv_result): 212 | csv_reader = csv.DictReader( 213 | io.StringIO(csv_result.decode(), newline='')) 214 | field_names = csv_reader.fieldnames 215 | rows = list(csv_reader) 216 | 217 | def get_sort_key(row): 218 | return parse_csv_date(row['Datetime']).timestamp() 219 | 220 | transactions_file = os.path.join(self.output_directory, 221 | 
                                          'transactions.csv')
222 |         csv_merge.merge_into_file(filename=transactions_file,
223 |                                   field_names=field_names, data=rows,
224 |                                   sort_by=get_sort_key)
225 | 
226 |     def get_existing_balances(self):
227 |         if not os.path.exists(self.balances_path):
228 |             return []
229 |         with open(self.balances_path, 'r', newline='', encoding='utf-8') as f:
230 |             csv_reader = csv.DictReader(f)
231 |             assert csv_reader.fieldnames == balance_field_names
232 |             return list(csv_reader)
233 | 
234 |     def get_start_date(self):
235 |         existing_balances = self.get_existing_balances()
236 |         if not existing_balances:
237 |             return self.earliest_history_date
238 |         return max(
239 |             datetime.datetime.strptime(row['End Date'], standard_date_format)
240 |             .date() for row in existing_balances) + datetime.timedelta(days=1)
241 | 
242 |     def fetch_statement(self, start_date, end_date):
243 |         logging.info('Fetching statement: [%s, %s]',
244 |                      start_date.strftime(standard_date_format),
245 |                      end_date.strftime(standard_date_format))
246 |         self.goto_statement(start_date, end_date)
247 |         start_balance, end_balance = self.get_balances()
248 |         # Write transactions before balance information, so that if an error occurs the statement will be retried next time
249 |         if (start_balance, end_balance) != ('unknown', 'unknown'):
250 |             csv_data = self.download_csv()
251 |             self.write_csv(csv_data)
252 |         else:
253 |             logging.info(
254 |                 'Skipping fetching transactions CSV because current period has no transactions'
255 |             )
256 |         csv_merge.merge_into_file(
257 |             filename=self.balances_path,
258 |             field_names=balance_field_names,
259 |             data=[{
260 |                 'Start Date': start_date.strftime(standard_date_format),
261 |                 'End Date': end_date.strftime(standard_date_format),
262 |                 'Start Balance': start_balance,
263 |                 'End Balance': end_balance,
264 |             }],
265 |             sort_by=lambda row: (row['Start Date'], row['End Date']),
266 |         )
267 | 
268 |     def fetch_history(self):
269 | 
270 |         start_date = self.get_start_date()
271 |         logging.info('Fetching history starting from %s',
272 |                      start_date.strftime('%Y-%m-%d'))
273 | 
274 |         while start_date <= self.latest_history_date:
275 |             end_date = min(self.latest_history_date,
276 |                            start_date + datetime.timedelta(days=89))
277 |             self.fetch_statement(start_date, end_date)
278 |             start_date = end_date + datetime.timedelta(days=1)
279 | 
280 |     def run(self):
281 |         self.login()
282 |         self.fetch_history()
283 | 
284 | 
285 | def run(**kwargs):
286 |     scrape_lib.run_with_scraper(Scraper, **kwargs)
287 | 
288 | 
289 | def interactive(**kwargs):
290 |     return scrape_lib.interact_with_scraper(Scraper, **kwargs)
291 | 
--------------------------------------------------------------------------------
/finance_dl/healthequity.py:
--------------------------------------------------------------------------------
1 | """Retrieves transaction and balance information from HealthEquity.
2 | 
3 | This uses the `selenium` Python package in conjunction with `chromedriver` to
4 | scrape the HealthEquity website.
5 | 
6 | Configuration:
7 | ==============
8 | 
9 | The following keys may be specified as part of the configuration dict:
10 | 
11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'`
12 |   keys.
13 | 
14 | - `output_directory`: Required. Must be a `str` that specifies the path on the
15 |   local filesystem where the output will be written. If the directory does not
16 |   exist, it will be created. For compatibility with `beancount-import`, the
17 |   last component of the `output_directory` should be your HealthEquity account
18 |   number.
19 | 20 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 21 | path to a persistent Chrome browser profile to use. This should be a path 22 | used solely for this single configuration; it should not refer to your normal 23 | browser profile. If not specified, a fresh temporary profile will be used 24 | each time. It is highly recommended to specify a `profile_dir` to avoid 25 | having to manually enter a multi-factor authentication code each time. 26 | 27 | Output format: 28 | ============== 29 | 30 | Cash transactions relating to contributions, distributions, and other are saved 31 | to `cash-transactions-contribution.csv`, `cash-transactions-distribution.csv`, 32 | and `cash-transactions-other.csv`, respectively, with the following fields: 33 | 34 | "Date","Transaction","Amount","Cash Balance" 35 | 36 | Investment transactions are saved to `investment-transactions.csv` with the 37 | following fields: 38 | 39 | "Date","Fund","Category","Description","Price","Amount","Shares","Total Shares","Total Value" 40 | 41 | Investment holdings are saved to files named like 42 | `YYYY-MM-ddTHHMMSSZZZZ.balances.csv`, where the date and time are the date and 43 | time at which the scraper was run. 44 | 45 | Example: 46 | ======== 47 | 48 | def CONFIG_healthequity(): 49 | return dict( 50 | module='finance_dl.healthequity', 51 | credentials={ 52 | 'username': 'XXXXXX', 53 | 'password': 'XXXXXX', 54 | }, 55 | # Use your HealthEquity account number as the last directory component. 56 | output_directory=os.path.join(data_dir, 'healthequity', '1234567'), 57 | 58 | # profile_dir is optional but highly recommended to avoid having to 59 | # enter multi-factor authentication code each time. 60 | profile_dir=os.path.join(profile_dir, 'healthequity'), 61 | ) 62 | 63 | Interactive shell: 64 | ================== 65 | 66 | From the interactive shell, type: `self.run()` to start the scraper. 67 | 68 | """ 69 | 70 | import urllib.parse 71 | import re 72 | import datetime 73 | import time 74 | import logging 75 | import os 76 | import bs4 77 | from selenium.webdriver.common.by import By 78 | from selenium.webdriver.support.ui import Select 79 | from selenium.webdriver.common.keys import Keys 80 | from . import scrape_lib 81 | from . 
import csv_merge 82 | 83 | logger = logging.getLogger('healthequity_scrape') 84 | 85 | netloc_re = r'^([^\.@]+\.)*healthequity.com$' 86 | 87 | 88 | def check_url(url): 89 | result = urllib.parse.urlparse(url) 90 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 91 | raise RuntimeError('Reached invalid URL: %r' % url) 92 | 93 | 94 | def find_first_matching_date(lines, date_format): 95 | for line in lines: 96 | try: 97 | return datetime.datetime.strptime(line, date_format).date() 98 | except: 99 | pass 100 | return None 101 | 102 | 103 | FUND_ACTIVITY_HEADERS = [ 104 | 'Fund', 'Name', 'Shares (#)', 'Closing Price', 'Closing Value' 105 | ] 106 | 107 | 108 | def write_balances(data, path): 109 | rows = [] 110 | for entry in data: 111 | keys = [x[0] for x in entry] 112 | if keys == FUND_ACTIVITY_HEADERS: 113 | row_values = dict(entry) 114 | row_values['Fund'] = row_values['Fund'].strip().split()[0] 115 | rows.append(row_values) 116 | csv_merge.write_csv(FUND_ACTIVITY_HEADERS, rows, path) 117 | 118 | 119 | def write_fund_activity(raw_transactions_data, path): 120 | input_date_format = '%m/%d/%Y' 121 | output_date_format = '%Y-%m-%d' 122 | soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml') 123 | headers = [ 124 | 'Date', 'Fund', 'Category', 'Description', 'Price', 'Amount', 'Shares', 125 | 'Total Shares', 'Total Value' 126 | ] 127 | rows = [] 128 | for row in soup.find_all('tr'): 129 | cells = [str(x.text).strip() for x in row.find_all('td')] 130 | while cells and not cells[-1].strip(): 131 | del cells[-1] 132 | if len(cells) == 1: 133 | continue 134 | assert len(cells) == len(headers) 135 | if cells == headers: 136 | continue 137 | row_values = dict(zip(headers, cells)) 138 | row_values['Date'] = datetime.datetime.strptime( 139 | row_values['Date'], input_date_format).strftime(output_date_format) 140 | rows.append(row_values) 141 | csv_merge.merge_into_file(filename=path, field_names=headers, data=rows, 142 | sort_by=lambda x: x['Date']) 143 | 144 | 145 | def write_transactions(raw_transactions_data, path): 146 | input_date_format = '%m/%d/%Y' 147 | output_date_format = '%Y-%m-%d' 148 | soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml') 149 | headers = ['Date', 'Transaction', 'Amount', 'HSA Cash Balance'] 150 | output_headers = ['Date', 'Transaction', 'Amount', 'Cash Balance'] 151 | rows = [] 152 | for row in soup.find_all('tr'): 153 | cells = [str(x.text).strip() for x in row.find_all('td')] 154 | while cells and not cells[-1].strip(): 155 | del cells[-1] 156 | if len(cells) <= 1: 157 | continue 158 | if cells[0] == 'TOTAL': 159 | continue 160 | assert len(cells) == len(headers) 161 | if cells == headers: 162 | continue 163 | row_values = dict(zip(headers, cells)) 164 | # Sanitize whitespace in description 165 | row_values['Transaction'] = ' '.join(row_values['Transaction'].split()) 166 | row_values['Cash Balance'] = row_values.pop('HSA Cash Balance') 167 | 168 | # Sanitize date_str 169 | date_str = row_values['Date'] 170 | date_str = re.sub('\\(Available .*\\)', '', date_str) 171 | 172 | row_values['Date'] = datetime.datetime.strptime( 173 | date_str, input_date_format).strftime(output_date_format) 174 | rows.append(row_values) 175 | rows.reverse() 176 | csv_merge.merge_into_file(filename=path, field_names=output_headers, 177 | data=rows, sort_by=lambda x: x['Date']) 178 | 179 | 180 | class Scraper(scrape_lib.Scraper): 181 | def __init__(self, credentials, output_directory, **kwargs): 182 | super().__init__(**kwargs) 183 | 
self.credentials = credentials 184 | self.output_directory = output_directory 185 | self.logged_in = False 186 | 187 | def check_after_wait(self): 188 | check_url(self.driver.current_url) 189 | 190 | def login(self): 191 | if self.logged_in: 192 | return 193 | logger.info('Initiating log in') 194 | self.driver.get('https://my.healthequity.com/') 195 | 196 | (username, password), = self.wait_and_return( 197 | self.find_username_and_password_in_any_frame) 198 | logger.info('Entering username and password') 199 | username.send_keys(self.credentials['username']) 200 | password.send_keys(self.credentials['password']) 201 | with self.wait_for_page_load(): 202 | password.send_keys(Keys.ENTER) 203 | logger.info('Logged in') 204 | self.logged_in = True 205 | 206 | def download_transaction_history(self): 207 | (transactions_link, ), = self.wait_and_return( 208 | lambda: self.find_visible_elements_by_descendant_partial_text('Transaction History', 'td')) 209 | scrape_lib.retry(transactions_link.click, retry_delay=2) 210 | (date_select, ), = self.wait_and_return( 211 | lambda: self.find_visible_elements_by_descendant_partial_text('All dates', 'select')) 212 | date_select = Select(date_select) 213 | with self.wait_for_page_load(): 214 | date_select.select_by_visible_text('All dates') 215 | 216 | results = {} 217 | for transaction_type in ['Contribution', 'Distribution', 'Other']: 218 | logger.info('Retrieving transaction history of type %s', 219 | transaction_type) 220 | (type_select, ), = self.wait_and_return( 221 | lambda: self.find_visible_elements_by_descendant_partial_text('All Transaction Types', 'select')) 222 | type_select = Select(type_select) 223 | with self.wait_for_page_load(): 224 | type_select.select_by_visible_text(transaction_type) 225 | 226 | (download_link,), = self.wait_and_return( 227 | lambda: self.find_visible_elements(By.XPATH, '//input[contains(@value,"Download")]')) 228 | scrape_lib.retry(download_link.click, retry_delay=2) 229 | # (excel_link,), = self.wait_and_return( 230 | # lambda: self.find_visible_elements(By.XPATH, '//input[contains(@name,"Excel")]')) 231 | # scrape_lib.retry(excel_link.click, retry_delay=2) 232 | logger.info('Waiting for downloaded transaction history') 233 | download_result, = self.wait_and_return(self.get_downloaded_file) 234 | results[transaction_type] = download_result[1] 235 | self.driver.back() # undo selection of transaction type 236 | self.driver.refresh() 237 | 238 | self.driver.back() # undo selection of "All dates" 239 | self.driver.back() # undo selection of "Transaction history" 240 | self.driver.refresh() 241 | 242 | return results 243 | 244 | def get_investment_balance(self): 245 | headers = FUND_ACTIVITY_HEADERS 246 | (table, ), = self.wait_and_return( 247 | lambda: scrape_lib.find_table_by_headers(self, headers)) 248 | data = scrape_lib.extract_table_data(table, headers) 249 | return data 250 | 251 | def go_to_investment_history(self): 252 | logger.info('Going to investment history') 253 | self.driver.get( 254 | 'https://www.healthequity.com/Member/Investment/Desktop.aspx') 255 | 256 | def download_fund_activity(self): 257 | logger.info('Looking for fund activity link') 258 | (fund_activity_link,), = self.wait_and_return( 259 | lambda: self.find_visible_elements(By.XPATH, '//a[contains(@href, "FundActivity")]')) 260 | scrape_lib.retry(fund_activity_link.click, retry_delay=2) 261 | logger.info('Selecting date ranage for fund activity') 262 | (start_date,), = self.wait_and_return( 263 | lambda: self.find_visible_elements(By.XPATH, 
'//input[@type="text" and contains(@id, "dateSelectStart")]')) 264 | start_date.clear() 265 | start_date.send_keys('01011900') 266 | logger.info('Downloading fund activity') 267 | (download_link, ), = self.wait_and_return( 268 | lambda: self.driver.find_elements_by_link_text('Download')) 269 | scrape_lib.retry(download_link.click, retry_delay=2) 270 | logger.info('Waiting for fund activity download') 271 | download_result, = self.wait_and_return(self.get_downloaded_file) 272 | return download_result[1] 273 | 274 | def download_data(self): 275 | raw_transactions = self.download_transaction_history() 276 | self.go_to_investment_history() 277 | raw_balances = self.get_investment_balance() 278 | raw_fund_activity = self.download_fund_activity() 279 | return raw_transactions, raw_balances, raw_fund_activity 280 | 281 | def run(self): 282 | self.login() 283 | if not os.path.exists(self.output_directory): 284 | os.makedirs(self.output_directory) 285 | raw_transactions, raw_balances, raw_fund_activity = self.download_data( 286 | ) 287 | write_balances( 288 | raw_balances, 289 | os.path.join( 290 | self.output_directory, 291 | '%s.balances.csv' % time.strftime('%Y-%m-%dT%H%M%S%z'))) 292 | for k, v in raw_transactions.items(): 293 | write_transactions( 294 | v, 295 | os.path.join(self.output_directory, 296 | 'cash-transactions-%s.csv' % (k.lower()))) 297 | write_fund_activity( 298 | raw_fund_activity, 299 | os.path.join(self.output_directory, 'investment-transactions.csv')) 300 | 301 | 302 | def run(**kwargs): 303 | scrape_lib.run_with_scraper(Scraper, **kwargs) 304 | 305 | 306 | def interactive(**kwargs): 307 | return scrape_lib.interact_with_scraper(Scraper, **kwargs) 308 | -------------------------------------------------------------------------------- /finance_dl/ofx.py: -------------------------------------------------------------------------------- 1 | """Retrieves OFX transaction and balance information using the OFX protocol. 2 | 3 | This module uses the `ofxclient` Python package to connect directly to financial 4 | institutions that support the OFX protocol. 5 | 6 | Refer to https://www.ofxhome.com/ to search for OFX connection information for 7 | your financial institution. 8 | 9 | Configuration: 10 | ============== 11 | 12 | The following keys may be specified as part of the configuration dict: 13 | 14 | - `ofx_params`: Required. Must be a `dict` with the following fields: 15 | - `id`: FI Id value (refer to https://www.ofxhome.com/) 16 | - `org`: FI Org value (refer to https://www.ofxhome.com/) 17 | - `url`: FI Url value (refer to https://www.ofxhome.com/) 18 | - `username`: Username for your account. 19 | - `password`: Password for your account. 20 | - `client_args`: Optional. `dict` of additional arguments to pass to the 21 | `ofxclient` library. Some banks, such as Chase, require that the OFX 22 | version be set to at least 103 and a unique client id be specified. This 23 | can be achieved using a `client_args` value of: 24 | 25 | dict( 26 | ofx_version='103', 27 | id='64f0e0bfe04f1a2d32cbddc8d30a3017', 28 | ) 29 | 30 | where `id` is a random hex string obtained from e.g.: 31 | `openssl rand -hex 16`. 32 | 33 | - `output_directory`: Required. Must be a `str` that specifies the path to the 34 | directory where OFX files are to be written. If it does not exist, it will be 35 | created. 36 | 37 | - `overlap_days`: Optional. An `int` that specifies the number of days of 38 | overlap to use when retrieving additional transactions. 
This is intended to
39 |   reduce the chances of transactions being missed (and duplicate transactions
40 |   can easily be filtered when processing the downloaded data). The default
41 |   value of `2` should be suitable in almost all cases.
42 | 
43 | - `min_start_date`: Optional. A `datetime.date` object specifying the earliest
44 |   date at which to attempt to retrieve data. If no existing files are present
45 |   for this account in the output directory, a binary search is done starting
46 |   from this date to determine the first date for which the server returns a
47 |   valid response. Otherwise, it is ignored. Defaults to `1990-01-01`, which
48 |   should be suitable in almost all cases.
49 | 
50 | - `min_days_retrieved`: Optional. An `int` specifying the minimum number of
51 |   days for which the server is expected to give data. It is assumed that if a
52 |   request is made starting no more than this many days from today, all
53 |   transactions will be received, and no additional request will be made. The
54 |   default value of `20` should be suitable in most cases, as most servers
55 |   support returning at least 30 days of transactions per request.
56 | 
57 | Output format:
58 | ==============
59 | 
60 | This module downloads OFX data for all accounts that are accessible using the
61 | specified `username`. The data for each account is stored in the sub-directory
62 | of the specified `output_directory` with a name equal to the account number. If
63 | the sub-directory does not exist, it will be created.
64 | 
65 | Within each account sub-directory, OFX files are saved using the file naming
66 | scheme:
67 | 
68 |     <start_date>-<end_date>--<download_time>.ofx
69 | 
70 | where <start_date> and <end_date> are in YYYYMMDD format and <download_time>
71 | is in seconds since epoch. The start and end dates reflect the DTSTART and
72 | DTEND fields in the OFX file.
73 | 
74 | Because some institutions only allow a limited number of days of data to be
75 | retrieved in a single request, this program automatically issues repeated
76 | requests in order to download all available data.
77 | 
78 | If no files have already been downloaded, a binary search is used to find the
79 | oldest point at which data is available.
80 | 
81 | Requests are issued repeatedly to fill any gaps in the range of downloaded data,
82 | and to extend the range towards the present date.
83 | 
84 | At least one request extending up to the present date is always issued in order
85 | to ensure up-to-date information is available.
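To make the naming scheme concrete, the date ranges already covered for one account can be recovered from the filenames alone. A minimal sketch (the `covered_ranges` helper is hypothetical and not part of this module):

    import datetime
    import os
    import re

    def covered_ranges(account_dir):
        # Parse "<start>-<end>--<time>.ofx" filenames into (start, end) date pairs.
        pattern = re.compile(r'^([0-9]{8})-([0-9]{8})--([0-9]+)\.ofx$')
        ranges = []
        for name in os.listdir(account_dir):
            match = pattern.match(name)
            if match is None:
                continue
            start = datetime.datetime.strptime(match.group(1), '%Y%m%d').date()
            end = datetime.datetime.strptime(match.group(2), '%Y%m%d').date()
            ranges.append((start, end))
        return sorted(ranges)

Gaps between consecutive ranges returned by such a helper are what the repeated requests described above are intended to fill.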
86 | 87 | Example: 88 | ======== 89 | 90 | def CONFIG_vanguard(): 91 | # To determine the correct values for `id`, `org`, and `url` for your 92 | # financial institution, search on https://www.ofxhome.com/ 93 | ofx_params = { 94 | 'id': '15103', 95 | 'org': 'Vanguard', 96 | 'url': 'https://vesnc.vanguard.com/us/OfxDirectConnectServlet', 97 | 'username': 'XXXXXX', 98 | 'password': 'XXXXXX', 99 | } 100 | return dict( 101 | module='finance_dl.ofx', 102 | ofx_params=ofx_params, 103 | output_directory=os.path.join(data_dir, 'vanguard'), 104 | ) 105 | 106 | """ 107 | 108 | import contextlib 109 | import warnings 110 | import datetime 111 | import os 112 | import time 113 | import re 114 | import logging 115 | import io 116 | 117 | from atomicwrites import atomic_write 118 | import bs4 119 | import dateutil.parser 120 | import ofxclient.institution 121 | import ofxclient 122 | 123 | from beancount.ingest.importers.ofx import parse_ofx_time, find_child 124 | 125 | warnings.filterwarnings('ignore', message='split()', module='re') 126 | 127 | logger = logging.getLogger('ofx') 128 | 129 | 130 | def sanitize_account_name(account_name: str): 131 | """Replaces any sequence of invalid characters in the account name with a dash. 132 | 133 | Returns the sanitized account name. 134 | """ 135 | if account_name == '.' or account_name == '..': 136 | raise ValueError('Invalid account name: %s' % account_name) 137 | 138 | return re.sub('[^a-z0-9A-Z.-]+', '-', account_name) 139 | 140 | 141 | def download_account_data_starting_from(account: ofxclient.account.Account, 142 | date: datetime.date): 143 | logger.info('Trying to retrieve data for %s starting at %s.', 144 | account.number, date) 145 | num_days = (datetime.date.today() - date).days 146 | return account.download(days=num_days).read().encode('ascii') 147 | 148 | 149 | def get_ofx_date_range(data: bytes): 150 | soup = bs4.BeautifulSoup(io.BytesIO(data), 'html.parser') 151 | dtstart = find_child(soup, 'dtstart', parse_ofx_time) 152 | dtend = find_child(soup, 'dtend', parse_ofx_time) 153 | if dtstart is None or dtend is None: 154 | logger.debug('Data received: %r', data) 155 | messages = soup.find_all('message') 156 | logger.info('Messages: %r', [message.text for message in messages]) 157 | return None 158 | return dtstart, dtend 159 | 160 | 161 | def get_earliest_data(account, start_date): 162 | """Try to retrieve earliest batch of account data, starting at `start_date'. 163 | 164 | Uses binary search to find the earliest point after start_date that yields a valid response. 165 | 166 | Returns ((startdate, enddate), data). 
167 | """ 168 | logger.info( 169 | 'Binary searching to find earliest data available for account %s.', 170 | account.number) 171 | lower_bound = start_date 172 | upper_bound = datetime.date.today() 173 | valid_data = None 174 | valid_date_range = None 175 | while lower_bound + datetime.timedelta(days=1) < upper_bound: 176 | mid = lower_bound + datetime.timedelta(days=(upper_bound - lower_bound 177 | ).days // 2) 178 | data = download_account_data_starting_from(account, mid) 179 | date_range = get_ofx_date_range(data) 180 | if date_range is not None: 181 | upper_bound = mid 182 | valid_data = data 183 | valid_date_range = date_range 184 | else: 185 | lower_bound = mid 186 | if not valid_data: 187 | raise RuntimeError('Failed to retrieve any data for account: %s' % 188 | account.number) 189 | return valid_date_range, valid_data 190 | 191 | 192 | def save_single_account_data( 193 | account: ofxclient.account.Account, output_dir: str, overlap_days=2, 194 | min_days_retrieved=20, 195 | min_start_date: datetime.date = dateutil.parser.parse( 196 | '1990-01-01').date(), 197 | always_save=True): 198 | """Attempts to download all transactions for the specified account. 199 | 200 | :param account: The connected account for which to download data. 201 | :param output_dir: Path to filesystem directory in which to store the 202 | downloaded OFX files. It will be (recursively) created if it does not 203 | exist. Saved files will be named 204 | "---.ofx", where and 205 | are in YYYYMMDD format and is in seconds 206 | since epoch. Date ranges corresponding to existing files with this 207 | naming pattern will not be re-downloaded. 208 | :param overlap_days: The number of days of overlap to use when retrieving 209 | additional transactions. This is intended to reduce the chances of 210 | transactions being missed (and duplicate transactions can easily be 211 | filtered when processing the downloaded data). The default value should 212 | be suitable in almost all cases. 213 | :param min_days_retrieved: The minimum number of days the server is expected 214 | to give data for. This function assumes that if a request is made 215 | starting no more than this many days from today, that all transactions 216 | will be received, and no additional request will be made. The default 217 | value should be suitable in most cases, as most servers support 218 | returning at least 30 days of transactions per request. 219 | :param min_start_date: If no existing files are present in `output_dir`, a 220 | binary search is done starting from this date to determine the first 221 | date for which the server returns a valid response. If this search turns 222 | up zero transactions, then nothing is saved for this account. 223 | :param always_save: When a new OFX file is downloaded that contains an 224 | end-date that matches a previously downloaded file's end-date, this flag 225 | determines if the new file should be saved or not. By not saving it, 226 | some transactions that occur later in the day could be missed (until 227 | additional transactions arrive on later days and they get included in 228 | the next download). By always saving the file, superfluous files could 229 | be created. 230 | """ 231 | 232 | if not os.path.exists(output_dir): 233 | os.makedirs(output_dir) 234 | date_format = '%Y%m%d' 235 | 236 | date_ranges = [] 237 | 238 | # Read all OFX files in output directory. 
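    # For example, a (hypothetical) file named "20190101-20190331--1554076800.ofx"
    # would contribute the already-covered range 2019-01-01..2019-03-31, so that
    # span is not requested again (apart from the `overlap_days` overlap).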
239 | for name in os.listdir(output_dir): 240 | match = re.match(r'^([0-9]{8})-([0-9]{8})--([0-9]+)\.ofx', name) 241 | if match is not None: 242 | start_date = datetime.datetime.strptime( 243 | match.group(1), date_format).date() 244 | end_date = datetime.datetime.strptime(match.group(2), 245 | date_format).date() 246 | if start_date > end_date: 247 | logger.warning('Invalid filename: %r', 248 | os.path.join(output_dir, name)) 249 | continue 250 | date_ranges.append((start_date, end_date)) 251 | date_ranges.sort() 252 | 253 | def save_data(date_range, data): 254 | t = time.time() 255 | logger.info('Received data %s -- %s', date_range[0], date_range[1]) 256 | filename = ('%s-%s--%d.ofx' % (date_range[0].strftime(date_format), 257 | date_range[1].strftime(date_format), t)) 258 | with atomic_write(os.path.join(output_dir, filename), mode='wb') as f: 259 | f.write(data) 260 | date_ranges.append((date_range[0].date(), date_range[1].date())) 261 | date_ranges.sort() 262 | 263 | if len(date_ranges) == 0: 264 | try: 265 | date_range, data = get_earliest_data(account, 266 | start_date=min_start_date) 267 | except RuntimeError as error: 268 | logger.warning(error) 269 | return 270 | 271 | save_data(date_range, data) 272 | 273 | def retrieve_more(): 274 | # Find next gap 275 | cur_range = None 276 | for i, cur_range in enumerate(date_ranges): 277 | if (i + 1 < len(date_ranges) and 278 | cur_range[1] > date_ranges[i + 1][0]): 279 | # If end date of current range is greater than start date of 280 | # next range, then there is no gap. 281 | continue 282 | break 283 | data = download_account_data_starting_from( 284 | account, cur_range[1] - datetime.timedelta(days=overlap_days)) 285 | date_range = get_ofx_date_range(data) 286 | if date_range is None: 287 | logger.warning('Failed to retrieve newer data for account %s', 288 | account.number) 289 | return False 290 | if (date_range[1].date() - cur_range[1]).days == 0: 291 | if always_save: 292 | save_data(date_range, data) 293 | return False 294 | save_data(date_range, data) 295 | return True 296 | 297 | while True: 298 | if not retrieve_more(): 299 | break 300 | if (datetime.date.today() - date_ranges[-1][0] 301 | ).days <= min_days_retrieved: 302 | break 303 | 304 | 305 | def save_all_account_data(inst: ofxclient.institution.Institution, 306 | output_dir: str, **kwargs): 307 | """Attempts to download data for all accounts. 308 | 309 | :param inst: The institution connection. 310 | :param output_dir: The base output directory in which to store the 311 | downloaded OFX files. The data for each account is saved in a 312 | subdirectory of `output_dir`, with a name equal to the account number. 313 | :param kwargs: Additional arguments to pass to save_single_account_data. 314 | """ 315 | accounts = inst.accounts() 316 | for a in accounts: 317 | try: 318 | name = sanitize_account_name(a.number) 319 | except ValueError: 320 | logger.warning('Account number is invalid path component: %r', 321 | name) 322 | continue 323 | save_single_account_data( 324 | account=a, output_dir=os.path.join(output_dir, name), **kwargs) 325 | 326 | 327 | def connect(params: dict) -> ofxclient.institution.Institution: 328 | """Connects to an OFX server. 329 | 330 | :param params: A dict containing the following string fields: 331 | 332 | - id: FI Id (see ofxhome.com) 333 | 334 | - org: FI Org (see ofxhome.com) 335 | 336 | - url: FI Url (see ofxhome.com) 337 | 338 | - broker_id: Optional. 
FI Broker Id (see ofxhome.com) 339 | 340 | - username: Your username 341 | 342 | - password: Your password 343 | 344 | :returns: A connected ofxclient.institution.Institution object. 345 | """ 346 | inst = ofxclient.institution.Institution(**params) 347 | inst.authenticate() 348 | return inst 349 | 350 | 351 | def run(ofx_params, output_directory, headless=False, **kwargs): 352 | """Download non-interactively.""" 353 | del headless 354 | inst = connect(ofx_params) 355 | save_all_account_data(inst, output_directory, **kwargs) 356 | 357 | 358 | @contextlib.contextmanager 359 | def interactive(ofx_params, output_directory, headless=False): 360 | """Returns variables for interactive session.""" 361 | del headless 362 | yield dict( 363 | ofx_params=ofx_params, 364 | output_directory=output_directory, 365 | inst=connect(ofx_params), 366 | ) 367 | -------------------------------------------------------------------------------- /finance_dl/scrape_lib.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import time 4 | import tempfile 5 | import shutil 6 | import seleniumrequests 7 | 8 | from selenium import webdriver 9 | from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 10 | from selenium.webdriver.support.ui import WebDriverWait, Select 11 | from selenium.webdriver.support import expected_conditions 12 | import signal 13 | 14 | from selenium.webdriver.remote.webdriver import WebDriver 15 | 16 | from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException 17 | from selenium.webdriver.common.by import By 18 | from selenium.webdriver.common.keys import Keys 19 | 20 | 21 | def all_conditions(*conditions): 22 | return lambda driver: all(condition(driver) for condition in conditions) 23 | 24 | 25 | def extract_table_data(table, header_names, single_header=False): 26 | rows = table.find_elements_by_xpath('thead/tr | tbody/tr | tr') 27 | headers = [] 28 | seen_data = False 29 | data = [] 30 | for row in rows: 31 | cell_elements = row.find_elements_by_xpath('th | td') 32 | cell_values = [x.text.strip() for x in cell_elements] 33 | is_header_values = [x in header_names for x in cell_values if x] 34 | if len(is_header_values) == 0: 35 | is_header = True 36 | else: 37 | if any(is_header_values) != all(is_header_values): 38 | raise RuntimeError('Header mismatch: %r' % (list( 39 | zip(is_header_values, 40 | [x for x in cell_values if x]), 41 | ))) 42 | is_header = any(is_header_values) 43 | if is_header and (not seen_data or not single_header): 44 | if seen_data: 45 | headers.clear() 46 | cur_header = dict() 47 | headers.append(cur_header) 48 | cur_col = 0 49 | for text, el in zip(cell_values, cell_elements): 50 | colspan = el.get_attribute('colspan') 51 | if colspan is None: 52 | colspan = 1 53 | else: 54 | colspan = int(colspan) 55 | for span in range(colspan): 56 | if text: 57 | cur_header[cur_col] = text 58 | cur_col += 1 59 | else: 60 | seen_data = True 61 | cur_col = 0 62 | cur_data = [] 63 | for text, el in zip(cell_values, cell_elements): 64 | colspan = el.get_attribute('colspan') 65 | if colspan is None: 66 | colspan = 1 67 | else: 68 | colspan = int(colspan) 69 | header_parts = [] 70 | for span in range(colspan): 71 | for header in headers: 72 | part = header.get(cur_col) 73 | if part is not None: 74 | header_parts.append(part) 75 | cur_col += 1 76 | if text: 77 | cur_data.append((':'.join(header_parts), text)) 78 | if cur_data: 79 | data.append(cur_data) 80 | return data 81 | 82 
| 83 | def find_table_by_headers(scraper, headers): 84 | tables = None 85 | for header in headers: 86 | new_tables = scraper.find_visible_elements_by_descendant_partial_text( 87 | header, 'table') 88 | if tables is None: 89 | tables = set(new_tables) 90 | else: 91 | tables &= set(new_tables) 92 | return tables 93 | 94 | 95 | # https://stackoverflow.com/questions/8344776/can-selenium-interact-with-an-existing-browser-session 96 | def attach_to_session(executor_url, session_id): 97 | original_execute = WebDriver.execute 98 | 99 | def new_command_execute(self, command, params=None): 100 | if command == "newSession": 101 | # Mock the response 102 | return {'success': 0, 'value': None, 'sessionId': session_id} 103 | else: 104 | return original_execute(self, command, params) 105 | 106 | # Patch the function before creating the driver object 107 | WebDriver.execute = new_command_execute 108 | driver = webdriver.Remote(command_executor=executor_url, 109 | desired_capabilities={}) 110 | driver.session_id = session_id 111 | # Replace the patched function with original function 112 | WebDriver.execute = original_execute 113 | return driver 114 | 115 | 116 | def is_displayed(element): 117 | """Returns `True` if `element` is displayed. 118 | 119 | Ignores StaleElementReferenceException. 120 | """ 121 | 122 | try: 123 | return element.is_displayed() 124 | except StaleElementReferenceException: 125 | return False 126 | 127 | 128 | class Scraper(object): 129 | def __init__(self, download_dir=None, connect=None, chromedriver_bin='finance-dl-chromedriver-wrapper', 130 | headless=True, use_seleniumrequests=False, session_id=None, profile_dir=None): 131 | 132 | self.download_dir = download_dir 133 | 134 | if connect is not None and session_id is not None: 135 | print('Connecting to existing browser: %s %s' % (connect, 136 | session_id)) 137 | self.driver = attach_to_session(connect, session_id) 138 | return 139 | 140 | original_sigint_handler = signal.getsignal(signal.SIGINT) 141 | signal.signal(signal.SIGINT, signal.SIG_IGN) 142 | 143 | self.chromedriver_bin = chromedriver_bin 144 | chrome_options = webdriver.ChromeOptions() 145 | service_args = [] 146 | chrome_options.add_experimental_option('excludeSwitches', [ 147 | 'enable-automation', 148 | 'load-extension', 149 | 'load-component-extension', 150 | 'ignore-certificate-errors', 151 | 'test-type', 152 | ]) 153 | if profile_dir is not None: 154 | chrome_options.add_argument('user-data-dir=%s' % profile_dir) 155 | if not os.path.exists(profile_dir): 156 | os.makedirs(profile_dir) 157 | prefs = {} 158 | prefs['plugins.plugins_disabled'] = [ 159 | 'Chrome PDF Viewer', 'Chromium PDF Viewer' 160 | ] 161 | prefs['plugins.always_open_pdf_externally'] = True 162 | if download_dir is not None: 163 | prefs['download.default_directory'] = download_dir 164 | chrome_options.add_experimental_option('prefs', prefs) 165 | if headless: 166 | chrome_options.add_argument('headless') 167 | if use_seleniumrequests: 168 | driver_class = seleniumrequests.Chrome 169 | else: 170 | driver_class = webdriver.Chrome 171 | self.driver = driver_class( 172 | executable_path=self.chromedriver_bin, 173 | chrome_options=chrome_options, 174 | service_args=service_args, 175 | ) 176 | print(' --connect=%s --session-id=%s' % 177 | (self.driver.command_executor._url, self.driver.session_id)) 178 | signal.signal(signal.SIGINT, original_sigint_handler) 179 | 180 | def check_after_wait(self): 181 | """Function called after each wait.""" 182 | pass 183 | 184 | def get_downloaded_file(self): 185 | 
names = os.listdir(self.download_dir) 186 | partial_names = [] 187 | other_names = [] 188 | for name in names: 189 | if name.endswith('.part') or name.endswith('.crdownload'): 190 | partial_names.append(name) 191 | else: 192 | other_names.append(name) 193 | if len(other_names) == 0: 194 | return None 195 | if len(other_names) > 1: 196 | raise RuntimeError( 197 | 'More than one downloaded file: %r' % other_names) 198 | # if len(partial_names) > 0: 199 | # raise RuntimeError('Partial download files remain: %r' % partial_names) 200 | path = os.path.join(self.download_dir, other_names[0]) 201 | with open(path, 'rb') as f: 202 | data = f.read() 203 | if len(data) == 0: 204 | return None 205 | os.remove(path) 206 | return other_names[0], data 207 | 208 | # See http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html 209 | @contextlib.contextmanager 210 | def wait_for_page_load(self, timeout=30): 211 | old_page = self.driver.find_element_by_tag_name('html') 212 | yield 213 | WebDriverWait(self.driver, timeout).until( 214 | expected_conditions.staleness_of(old_page), 215 | message='waiting for page to load') 216 | self.check_after_wait() 217 | 218 | @contextlib.contextmanager 219 | def wait_for_new_url(self, timeout=30): 220 | old_url = self.driver.current_url 221 | yield 222 | 223 | def is_new_url(): 224 | if self.driver.current_url != old_url: 225 | return True 226 | raise NoSuchElementException 227 | 228 | self.wait_and_return(is_new_url) 229 | 230 | def wait_and_return(self, *conditions, timeout=30, 231 | message='Waiting to match conditions'): 232 | results = [None] 233 | 234 | def predicate(driver): 235 | results[0] = tuple(condition() for condition in conditions) 236 | return all(results[0]) 237 | 238 | WebDriverWait(self.driver, timeout).until(predicate, message=message) 239 | self.check_after_wait() 240 | return results[0] 241 | 242 | def wait_and_locate(self, *locators, timeout=30, only_displayed=False): 243 | conditions = [] 244 | for locator in locators: 245 | 246 | def condition(locator=locator): 247 | element = self.driver.find_element(*locator) 248 | if only_displayed: 249 | if not is_displayed(element): 250 | raise NoSuchElementException 251 | return element 252 | 253 | conditions.append(condition) 254 | return self.wait_and_return( 255 | *conditions, timeout=timeout, 256 | message='Waiting to locate %r' % (locators, )) 257 | 258 | def for_each_frame(self): 259 | self.driver.switch_to.default_content() 260 | 261 | seen_ids = set() 262 | def helper(nesting_level=0): 263 | def handle_frames(frames): 264 | frames = [f for f in frames if f.id not in seen_ids] 265 | seen_ids.update(f.id for f in frames) 266 | for frame in frames: 267 | self.driver.switch_to.frame(frame) 268 | yield from helper(nesting_level=nesting_level + 1) 269 | self.driver.switch_to.parent_frame() 270 | yield 271 | for element_name in ['frame', 'iframe']: 272 | try: 273 | other_frames = self.find_visible_elements( 274 | By.TAG_NAME, element_name) 275 | yield from handle_frames(other_frames) 276 | except: 277 | pass 278 | 279 | yield from helper() 280 | 281 | def find_elements_in_any_frame(self, by_method, locator, predicate=None, 282 | only_displayed=False): 283 | for frame in self.for_each_frame(): 284 | try: 285 | for element in self.driver.find_elements(by_method, locator): 286 | if only_displayed: 287 | try: 288 | if not is_displayed(element): 289 | continue 290 | except: 291 | import traceback 292 | traceback.print_exc() 293 | continue 294 | if predicate is None or 
predicate(element): 295 | yield element 296 | except NoSuchElementException: 297 | pass 298 | 299 | def find_element_in_any_frame(self, by_method, locator, **kwargs): 300 | for element in self.find_elements_in_any_frame(by_method, locator, 301 | **kwargs): 302 | return element 303 | raise NoSuchElementException 304 | 305 | def interact(self, global_vars, local_vars): 306 | import IPython 307 | # ipshell = InteractiveShellEmbed(banner1='', exit_msg='') 308 | # ipshell.extension_manager.load_extension('autoreload') 309 | # ipshell.run_line_magic('autoreload', '2') 310 | # ipshell.autoindent = False 311 | ns = global_vars.copy() 312 | ns.update(local_vars) 313 | ns['self'] = self 314 | IPython.terminal.ipapp.launch_new_instance(argv=[], user_ns=ns) 315 | # ipshell(local_ns=ns) 316 | # vars = global_vars.copy() 317 | # vars.update(local_vars) 318 | # shell = code.InteractiveConsole(vars) 319 | # shell.interact() 320 | 321 | def find_username_and_password(self): 322 | passwords = self.driver.find_elements(By.XPATH, 323 | '//input[@type="password"]') 324 | passwords = [x for x in passwords if is_displayed(x)] 325 | if len(passwords) == 0: 326 | raise NoSuchElementException() 327 | password = passwords[0] 328 | username = password.find_elements( 329 | By.XPATH, 'preceding::input[@type="text" or @type="email"]')[-1] 330 | if not is_displayed(username): 331 | raise NoSuchElementException() 332 | return username, password 333 | 334 | def find_username_and_password_in_any_frame(self): 335 | for frame in self.for_each_frame(): 336 | try: 337 | return self.find_username_and_password() 338 | except NoSuchElementException: 339 | pass 340 | raise NoSuchElementException() 341 | 342 | def find_visible_elements_by_descendant_partial_text( 343 | self, text, element_name): 344 | return self.find_elements_by_descendant_partial_text( 345 | text, element_name, only_displayed=True) 346 | 347 | def find_elements_by_descendant_partial_text(self, text, element_name, 348 | only_displayed=False): 349 | all_elements = self.driver.find_elements_by_xpath( 350 | "//text()[contains(.,%r)]/ancestor::*[self::%s][1]" % 351 | (text, element_name)) 352 | if only_displayed: 353 | return [x for x in all_elements if is_displayed(x)] 354 | return all_elements 355 | 356 | def find_elements_by_descendant_text_match(self, text_match, element_name, 357 | only_displayed=False): 358 | all_elements = self.driver.find_elements_by_xpath( 359 | "//text()[%s]/ancestor::*[self::%s][1]" % (text_match, 360 | element_name)) 361 | if only_displayed: 362 | return [x for x in all_elements if is_displayed(x)] 363 | return all_elements 364 | 365 | def find_visible_elements_by_partial_text(self, text, element_name): 366 | all_elements = self.driver.find_elements_by_xpath( 367 | "//%s[contains(.,%r)]" % (element_name, text)) 368 | return [x for x in all_elements if is_displayed(x)] 369 | 370 | def find_visible_elements(self, by_method, locator): 371 | elements = self.driver.find_elements(by_method, locator) 372 | return [x for x in elements if is_displayed(x)] 373 | 374 | def click(self, link): 375 | self.driver.execute_script('arguments[0].scrollIntoView(true);', link) 376 | link.click() 377 | 378 | 379 | @contextlib.contextmanager 380 | def temp_scraper(scraper_type, *args, headless=True, connect=None, 381 | session_id=None, **kwargs): 382 | download_dir = tempfile.mkdtemp() 383 | try: 384 | scraper = scraper_type(*args, download_dir=download_dir, 385 | connect=connect, session_id=session_id, 386 | headless=headless, **kwargs) 387 | try: 388 | yield 
scraper 389 | finally: 390 | if connect is None: 391 | try: 392 | scraper.driver.quit() 393 | except Exception as e: 394 | print('Error quitting driver: %r' % e) 395 | finally: 396 | shutil.rmtree(download_dir) 397 | 398 | 399 | def retry(func, num_tries=3, retry_delay=0): 400 | while True: 401 | try: 402 | return func() 403 | except Exception as e: 404 | import traceback 405 | traceback.print_exc() 406 | num_tries -= 1 407 | if num_tries <= 0: 408 | raise 409 | print('Waiting %g seconds before retrying' % (retry_delay, )) 410 | time.sleep(retry_delay) 411 | 412 | 413 | def run_with_scraper(scraper_class, **kwargs): 414 | first_call = True 415 | 416 | def fetch(): 417 | nonlocal first_call 418 | if not first_call: 419 | kwargs['headless'] = False 420 | first_call = False 421 | with temp_scraper(scraper_class, **kwargs) as scraper: 422 | scraper.run() 423 | 424 | retry(fetch) 425 | 426 | 427 | @contextlib.contextmanager 428 | def interact_with_scraper(scraper_class, **kwargs): 429 | with temp_scraper(scraper_class, **kwargs) as scraper: 430 | yield dict( 431 | scraper=scraper, 432 | self=scraper, 433 | By=By, 434 | Select=Select, 435 | Keys=Keys, 436 | ) 437 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /finance_dl/mint.py: -------------------------------------------------------------------------------- 1 | """Downloads Mint.com transactions and balance data. 2 | 3 | This uses the `mintapi` Python package in conjunction with the `selenium` Python 4 | package and `chromedriver` to scrape the Mint.com website. 5 | 6 | Configuration: 7 | ============== 8 | 9 | The following keys may be specified as part of the configuration dict: 10 | 11 | - `credentials`: Required. Must be a `dict` with `'username'` and `'password'` 12 | keys. 13 | 14 | - `output_directory`: Required. Must be a `str` that specifies the path on the 15 | local filesystem where the output will be written. If the directory does not 16 | exist, it will be created. 17 | 18 | - `profile_dir`: Optional. If specified, must be a `str` that specifies the 19 | path to a persistent Chrome browser profile to use. This should be a path 20 | used solely for this single configuration; it should not refer to your normal 21 | browser profile. If not specified, a fresh temporary profile will be used 22 | each time. It is highly recommended to specify a `profile_dir` to avoid 23 | having to manually enter a multi-factor authentication code each time. 24 | 25 | - `merge_files`: Optional. If specified, must be a list of `str` values that 26 | specify the paths to additional CSV files containing transactions in the same 27 | format as the `mint.csv` output file. These files are merged with the 28 | contents of `mint.csv` into a new file `mint-merged.csv` in the specified 29 | `output_directory`. 30 | 31 | - `skip_refresh`: Optional. Defaults to `False`. A value of `True` indicates 32 | not to wait until all account data has been refreshed. 33 | 34 | Output format: 35 | ============== 36 | 37 | The transactions are saved to a file named `mint.csv` under the specified output 38 | directory. Balance information is saved to files named 39 | `balances.%Y-%m-%dT%H%M%S%z.csv` under the specified output directory. 40 | 41 | Duplicate transactions are excluded from the merged file, in the following way: 42 | since the Mint CSV format lacks any sort of unique transaction identifier, 43 | multiple legitimate transactions may produce identical lines in the CSV file. 44 | Therefore, for each unique CSV line, considering only the 'Date', 'Original 45 | Description', 'Amount', 'Transaction Type', and 'Account Name' fields, the 46 | merged file contains N copies of this line, where N is the maximum number of 47 | times this line occurs in any of the input CSV files. 48 | 49 | Example: 50 | ======== 51 | 52 | def CONFIG_mint(): 53 | return dict( 54 | module='finance_dl.mint', 55 | credentials={ 56 | 'username': 'XXXXXX', 57 | 'password': 'XXXXXX', 58 | }, 59 | output_directory=os.path.join(data_dir, 'mint'), 60 | # profile_dir is optional, but highly recommended to avoid having to 61 | # enter multi-factor authentication code each time. 62 | profile_dir=os.path.join(profile_dir, 'mint'), 63 | ) 64 | 65 | Interactive shell: 66 | ================== 67 | 68 | From the interactive shell, type: 69 | 70 | run(output_directory=output_directory, profile_dir=profile_dir, 71 | credentials=credentials) 72 | 73 | to run the scraper. 
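
Merge de-duplication example:
=============================

A minimal sketch of the duplicate-handling rule described under "Output
format" above (an illustration of the counting behaviour only, not the actual
`merge_mint_data` implementation; `rows_per_file` is a hypothetical list of
row tuples, one list per input CSV file):

    import collections

    def merged_row_counts(rows_per_file):
        # For each unique row, keep the maximum count seen in any single file.
        merged = collections.Counter()
        for rows in rows_per_file:
            for row, count in collections.Counter(rows).items():
                merged[row] = max(merged[row], count)
        return merged

So a row that appears twice in `mint.csv` and three times in one of the
`merge_files` appears three times (not five) in `mint-merged.csv`.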
74 | 75 | """ 76 | 77 | import os 78 | from typing import Sequence, Optional, Dict 79 | import dateutil.parser 80 | import io 81 | import csv 82 | import re 83 | import contextlib 84 | import collections 85 | import urllib.parse 86 | import datetime 87 | import time 88 | import json 89 | import logging 90 | import traceback 91 | from selenium.webdriver.common.by import By 92 | from selenium.webdriver.common.keys import Keys 93 | import selenium.common.exceptions 94 | 95 | from . import csv_merge 96 | from . import scrape_lib 97 | 98 | if False: 99 | from mintapi import Mint # for typing only 100 | 101 | logger = logging.getLogger('mint') 102 | 103 | netloc_re = r'^([^\.@]+\.)*(mint|intuit).com$' 104 | 105 | 106 | def check_url(url): 107 | result = urllib.parse.urlparse(url) 108 | if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc): 109 | raise RuntimeError('Reached invalid URL: %r' % url) 110 | 111 | 112 | class MintTokenScraper(scrape_lib.Scraper): 113 | def __init__(self, credentials, login_timeout=30, **kwargs): 114 | super().__init__(use_seleniumrequests=True, **kwargs) 115 | self.credentials = credentials 116 | self.login_timeout = login_timeout 117 | 118 | def login(self): 119 | logger.info('Logging into mint') 120 | self.driver.get( 121 | "https://accounts.intuit.com/index.html?offering_id=Intuit.ifs.mint&namespace_id=50000026&redirect_url=https://mint.intuit.com/overview.event" 122 | ) 123 | logger.info('Waiting to enter username and password') 124 | (username, password), = self.wait_and_return( 125 | self.find_username_and_password_in_any_frame) 126 | logger.info('Entering username and password') 127 | username.send_keys(self.credentials['username']) 128 | password.send_keys(self.credentials['password']) 129 | password.send_keys(Keys.ENTER) 130 | start_time = time.time() 131 | while not self.driver.current_url.startswith( 132 | 'https://mint.intuit.com/overview.event'): 133 | logger.info('Waiting for MFA') 134 | time.sleep(1) 135 | cur_time = time.time() 136 | if self.login_timeout is not None and cur_time > start_time + self.login_timeout: 137 | raise TimeoutError('Login failed to complete within timeout') 138 | 139 | while True: 140 | token_element, = self.wait_and_locate((By.NAME, 'javascript-user')) 141 | value_json = token_element.get_attribute('value') 142 | logger.info('scraped user data: %r', value_json) 143 | try: 144 | value = json.loads(value_json) 145 | if isinstance(value, dict) and 'token' in value: 146 | break 147 | except ValueError: 148 | pass 149 | logger.info('Waiting for token') 150 | time.sleep(1) 151 | cur_time = time.time() 152 | if self.login_timeout is not None and cur_time > start_time + self.login_timeout: 153 | raise TimeoutError('Login failed to complete within timeout') 154 | 155 | 156 | @contextlib.contextmanager 157 | def connect(credentials, scraper_args=None): 158 | import mintapi 159 | mint = mintapi.Mint() 160 | scraper_args = dict(scraper_args or {}) 161 | 162 | def try_login(scraper): 163 | scraper = MintTokenScraper(credentials=credentials, **scraper_args) 164 | scraper.login() 165 | mint.driver = scraper.driver 166 | mint.token = mint.get_token() 167 | 168 | with scrape_lib.temp_scraper(MintTokenScraper, credentials=credentials, 169 | **scraper_args) as scraper: 170 | okay = False 171 | try: 172 | try_login(scraper) 173 | okay = True 174 | except (TimeoutError, selenium.common.exceptions.TimeoutException): 175 | if not scraper_args.get('headless') and not scraper_args.get( 176 | 'login_timeout'): 177 | raise 178 | 
traceback.print_exc() 179 | if okay: 180 | yield mint 181 | return 182 | scraper_args['headless'] = True 183 | scraper_args['login_timeout'] = None 184 | logger.info('Retrying login interactively') 185 | with scrape_lib.temp_scraper(MintTokenScraper, credentials=credentials, 186 | **scraper_args) as scraper: 187 | try_login(scraper) 188 | yield mint 189 | 190 | 191 | def match_csv_to_json(csv_entry: dict, json_entry: dict): 192 | json_date = dateutil.parser.parse(json_entry['date']) 193 | json_csv_entry = { 194 | 'Date': 195 | '%d/%02d/%d' % (json_date.month, json_date.day, json_date.year), 196 | 'Original Description': 197 | json_entry['omerchant'], 198 | 'Amount': 199 | json_entry['amount'].translate({ 200 | ord('$'): None, 201 | ord(','): None 202 | }), 203 | 'Transaction Type': 204 | 'debit' if json_entry['isDebit'] else 'credit', 205 | 'Account Name': 206 | json_entry['account'], 207 | } 208 | csv_entry = csv_entry.copy() 209 | csv_entry.pop('Category', None) 210 | csv_entry.pop('Description', None) 211 | csv_entry.pop('Labels', None) 212 | csv_entry.pop('Notes', None) 213 | if csv_entry != json_csv_entry: 214 | raise RuntimeError('CSV entry %r does not match JSON entry %r' % 215 | (csv_entry, json_csv_entry)) 216 | 217 | 218 | def get_annotated_transactions(mint: 'Mint', num_attempts: int = 3): 219 | for attempt_num in range(num_attempts): 220 | try: 221 | logger.info('Getting CSV transactions') 222 | csv_data = mint.get_transactions_csv( 223 | include_investment=True).decode() 224 | if len(csv_data) == 0: 225 | raise RuntimeError('Received empty Mint data') 226 | 227 | logger.info('Getting JSON transactions') 228 | json_data = mint.get_transactions_json(include_investment=True) 229 | 230 | reader = csv.DictReader(io.StringIO(csv_data, newline='')) 231 | csv_rows = list(reader) 232 | 233 | if len(csv_rows) != len(json_data): 234 | raise RuntimeError('CSV data does not match JSON data') 235 | 236 | for csv_entry, json_entry in zip(csv_rows, json_data): 237 | match_csv_to_json(csv_entry, json_entry) 238 | break 239 | except: 240 | if attempt_num + 1 == num_attempts: 241 | raise 242 | return (reader.fieldnames, list(zip(csv_rows, json_data))) 243 | 244 | 245 | def refresh_mint_data(mint: 'Mint'): 246 | logger.info('Initiating account refresh') 247 | mint.initiate_account_refresh() 248 | # Wait for downloading to be complete 249 | logger.info('Waiting for accounts to update') 250 | polling_interval_seconds = 5 251 | start_time = time.time() 252 | while True: 253 | time.sleep(polling_interval_seconds) 254 | accounts = mint.get_accounts() 255 | cur_time = time.time() 256 | pending = [] 257 | ok = [] 258 | other = [] 259 | for account in accounts: 260 | status = account['fiLoginStatus'] 261 | if status in ['DOWNLOADING_IN_PROGRESS', 'REFRESH_REQUESTED']: 262 | pending.append(account) 263 | elif status == 'OK': 264 | ok.append(account) 265 | else: 266 | other.append(account) 267 | if len(pending) == 0: 268 | break 269 | logger.info('[%d seconds] Still downloading: %s', 270 | cur_time - start_time, ' '.join( 271 | '%r' % account['name'] for account in pending)) 272 | cur_time = time.time() 273 | logger.info('[%d seconds] Finished updating' % (cur_time - start_time)) 274 | for account in other: 275 | logger.info('Account %r in state %r', account['name'], 276 | account['fiLoginStatus']) 277 | 278 | 279 | mint_date_format = '%m/%d/%Y' 280 | 281 | 282 | def get_mint_date(row: dict): 283 | date = datetime.datetime.strptime(row['Date'], mint_date_format).date() 284 | return date 285 | 286 | 287 | 
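# download_mint_data drops transactions that Mint flags as pending or
# duplicate, records each account's most recent transaction date, and builds
# one balance row per account (negating credit-card balances, which Mint
# reports with the opposite sign). It returns the filtered CSV text together
# with the balance rows.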
def download_mint_data(mint: 'Mint'): 288 | fieldnames, entries = get_annotated_transactions(mint) 289 | non_pending_txns = [ 290 | csv_row for csv_row, json_row in entries 291 | if not json_row['isPending'] and not json_row['isDuplicate'] 292 | ] 293 | 294 | balances = [] 295 | account_max_transaction_date = dict() # type: Dict[str, datetime.date] 296 | for csv_row in non_pending_txns: 297 | date = get_mint_date(csv_row) 298 | account = csv_row['Account Name'] 299 | prev_date = account_max_transaction_date.get(account) 300 | if prev_date is None or prev_date < date: 301 | account_max_transaction_date[account] = date 302 | 303 | account_data = mint.get_accounts() 304 | for account in account_data: 305 | account_name = account['name'] 306 | max_date = account_max_transaction_date.get(account_name) 307 | max_date_str = (max_date.strftime(mint_date_format) 308 | if max_date is not None else '') 309 | balance = account.get('currentBalance', '') 310 | if account['accountType'] == 'credit': 311 | # Mint negates credit card balances. 312 | balance = -balance 313 | balances.append({ 314 | 'Name': account_name, 315 | 'Currency': account.get('currency', ''), 316 | 'Balance': str(balance), 317 | 'Last Updated': str(account.get('lastUpdated', '')), 318 | 'State': account.get('fiLoginStatus', ''), 319 | 'Last Transaction': max_date_str, 320 | }) 321 | 322 | new_csv = io.StringIO(newline='') 323 | new_csv_data = csv.DictWriter(new_csv, fieldnames=fieldnames, 324 | lineterminator='\n', quoting=csv.QUOTE_ALL) 325 | new_csv_data.writeheader() 326 | new_csv_data.writerows(non_pending_txns) 327 | 328 | csv_data = new_csv.getvalue() 329 | return csv_data, balances 330 | 331 | 332 | def merge_mint_data(mint_data_list: Sequence[str]): 333 | fieldnames = None 334 | merged_counter = collections.Counter() # type: Dict[tuple, int] 335 | merged_rows = [] 336 | keep_fields = [ 337 | 'Date', 'Original Description', 'Amount', 'Transaction Type', 338 | 'Account Name' 339 | ] 340 | 341 | def convert_row(row) -> tuple: 342 | return tuple(row[field] for field in keep_fields) 343 | 344 | for csv_data in mint_data_list: 345 | cur_counter = collections.Counter() # type: Dict[tuple, int] 346 | reader = csv.DictReader(io.StringIO(csv_data, newline='')) 347 | if fieldnames is None: 348 | fieldnames = reader.fieldnames 349 | else: 350 | assert fieldnames == reader.fieldnames, (fieldnames, 351 | reader.fieldnames) 352 | rows = list(reader) 353 | for row in rows: 354 | converted_row = convert_row(row) 355 | cur_counter[converted_row] += 1 356 | if cur_counter[converted_row] > merged_counter[converted_row]: 357 | merged_rows.append(row) 358 | merged_counter[converted_row] += 1 359 | 360 | merged_rows.sort(key=get_mint_date, reverse=True) 361 | 362 | assert fieldnames is not None 363 | 364 | new_csv = io.StringIO(newline='') 365 | new_csv_data = csv.DictWriter(new_csv, fieldnames=fieldnames, 366 | lineterminator='\n', quoting=csv.QUOTE_ALL) 367 | new_csv_data.writeheader() 368 | new_csv_data.writerows(merged_rows) 369 | 370 | csv_data = new_csv.getvalue() 371 | return csv_data 372 | 373 | 374 | def merge_mint_files(input_paths: Sequence[str], output_path: str): 375 | mint_data_list = [] 376 | for filename in input_paths: 377 | with open(filename, 'r', encoding='utf-8', newline='') as f: 378 | mint_data_list.append(f.read()) 379 | csv_data = merge_mint_data(mint_data_list) 380 | with open(output_path, 'w', encoding='utf-8', newline='') as f: 381 | f.write(csv_data) 382 | 383 | 384 | def verify_mint_update_consistency(csv_data: str, 
                                   existing_filename: str,
385 |                                    allow_missing: bool = False):
386 |     unchanged = False
387 | 
388 |     if os.path.exists(existing_filename):
389 |         missing = False
390 |         with open(existing_filename, 'r', encoding='utf-8', newline='') as f:
391 |             old_data = f.read()
392 | 
393 |         def get_rows(data):
394 |             reader = csv.DictReader(io.StringIO(data, newline=''))
395 |             csv_rows = list(reader)
396 |             keep_fields = [
397 |                 'Date', 'Original Description', 'Amount', 'Transaction Type',
398 |                 'Account Name'
399 |             ]
400 | 
401 |             def convert_row(row):
402 |                 return tuple(row[field] for field in keep_fields)
403 | 
404 |             return list(map(convert_row, csv_rows))
405 | 
406 |         if old_data == csv_data:
407 |             unchanged = True
408 |         else:
409 |             old_rows = get_rows(old_data)
410 |             old_counter = collections.Counter(old_rows)
411 |             new_rows = get_rows(csv_data)
412 |             new_counter = collections.Counter(new_rows)
413 | 
414 |             for k in old_rows:
415 |                 if old_counter[k] > new_counter[k]:
416 |                     logger.warning('New file missing entry: %s', k)
417 |                     missing = True
418 |         if missing and not allow_missing:
419 |             raise RuntimeError('New file is missing some existing entries')
420 |     if not unchanged:
421 |         with open(existing_filename, 'w', encoding='utf-8', newline='') as f:
422 |             f.write(csv_data)
423 | 
424 | 
425 | def fetch_mint_data(credentials: dict, existing_filename: str,
426 |                     new_filename: str, balances_output_prefix: str,
427 |                     skip_refresh: bool = False, skip_download: bool = False,
428 |                     allow_missing: bool = False, **kwargs):
429 |     if new_filename == existing_filename:
430 |         raise ValueError('new_filename must not equal existing_filename')
431 |     if skip_download:
432 |         with open(new_filename, 'r', encoding='utf-8', newline='') as f:
433 |             csv_data = f.read()
434 |     else:
435 |         with connect(credentials, kwargs) as mint:
436 |             if not skip_refresh:
437 |                 refresh_mint_data(mint)
438 |             csv_data, balances = download_mint_data(mint)
439 |         with open(new_filename, 'w', encoding='utf-8', newline='') as f:
440 |             f.write(csv_data)
441 | 
442 |     if not skip_download:
443 |         # Balance data is only available when a fresh download was performed.
444 |         balances_path = balances_output_prefix + time.strftime(
445 |             '%Y-%m-%dT%H%M%S%z') + '.csv'
446 |         logger.info('Writing balances to: %s', balances_path)
447 |         csv_merge.write_csv(['Name', 'Currency', 'Balance', 'Last Updated',
448 |                              'State', 'Last Transaction'], balances, balances_path)
449 | 
450 |     verify_mint_update_consistency(csv_data=csv_data,
451 |                                    existing_filename=existing_filename,
452 |                                    allow_missing=allow_missing)
453 | 
454 | 
455 | def run(output_directory: str, merge_files: Sequence[str] = (), **kwargs):
456 |     if not os.path.exists(output_directory):
457 |         os.makedirs(output_directory)
458 |     existing_filename = os.path.join(output_directory, 'mint.csv')
459 |     new_filename = os.path.join(output_directory, 'mint.csv.new')
460 |     balances_output_prefix = os.path.join(output_directory, 'balances.')
461 |     fetch_mint_data(existing_filename=existing_filename,
462 |                     new_filename=new_filename,
463 |                     balances_output_prefix=balances_output_prefix, **kwargs)
464 |     if merge_files:
465 |         merged_filename = os.path.join(output_directory, 'mint-merged.csv')
466 |         merge_mint_files([existing_filename] + list(merge_files),
467 |                          merged_filename)
468 |         logger.info('Saved merged transactions to: %s', merged_filename)
469 | 
470 | 
471 | @contextlib.contextmanager
472 | def interactive(**kwargs):
473 |     with connect(kwargs['credentials'],
474 |                  dict(profile_dir=kwargs.get('profile_dir'))) as mint:
475 |         kwargs['mint'] = mint
476 |         yield kwargs
477 | 
--------------------------------------------------------------------------------