├── web ├── reports │ ├── __init__.py │ ├── tests.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ └── load.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0002_auto_20150713_0347.py │ │ └── 0001_initial.py │ ├── templatetags │ │ ├── __init__.py │ │ └── filters.py │ ├── urls.py │ ├── util.py │ ├── templates │ │ ├── base.html │ │ ├── bank.html │ │ └── index.html │ ├── views.py │ └── models.py ├── settings │ ├── __init__.py │ ├── dev.py │ └── prod.py ├── manage.py └── wsgi.py ├── deploy └── fdic-call-reports.conf ├── requirements.txt ├── LICENSE ├── bin ├── mrjob-transform.py ├── download_all_reports.py └── extract.py └── README.md /web/reports/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/tests.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/settings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/templatetags/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/reports/templatetags/filters.py: -------------------------------------------------------------------------------- 1 | from django import template 2 | 3 | 4 | 5 | register = template.Library() 6 | 7 | 8 | 9 | @register.filter 10 | def money(value, scale): 11 | return '${:,d}'.format(value / scale) 12 | -------------------------------------------------------------------------------- /web/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.dev") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /web/reports/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import patterns, include, url 2 | from reports.views import BankView, IndexView 3 | 4 | 5 | 6 | urlpatterns = patterns('', 7 | url(r'^$', IndexView.as_view(), name='index'), 8 | url(r'^bank/(?P\d+)$', BankView.as_view(), name='bank'), 9 | ) 10 | -------------------------------------------------------------------------------- /deploy/fdic-call-reports.conf: -------------------------------------------------------------------------------- 1 | WSGIScriptAlias / /opt/repos/fdic-call-reports/web/wsgi.py 2 | WSGIPythonPath /opt/repos/fdic-call-reports/web:/opt/repos/fdic-call-reports/lib/python2.7/site-packages 3 | 4 | 5 | Order deny,allow 6 | 7 | 8 | -------------------------------------------------------------------------------- /web/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for web project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.7/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.prod") 12 | 13 | from django.core.wsgi import get_wsgi_application 14 | application = get_wsgi_application() 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==1.7 2 | MySQL-python==1.2.5 3 | PyYAML==3.11 4 | awscli==1.4.4 5 | bcdoc==0.12.2 6 | boto==2.32.1 7 | botocore==0.63.0 8 | colorama==0.2.5 9 | django-debug-toolbar==1.3.2 10 | docutils==0.12 11 | filechunkio==1.6 12 | jmespath==0.4.1 13 | mrjob==0.4.4 14 | mysql==0.0.1 15 | pyasn1==0.1.7 16 | python-dateutil==2.2 17 | python-memcached==1.53 18 | rsa==3.1.2 19 | selenium==2.43.0 20 | simplejson==3.7.3 21 | six==1.8.0 22 | sqlparse==0.1.15 23 | wsgiref==0.1.2 24 | -------------------------------------------------------------------------------- /web/reports/migrations/0002_auto_20150713_0347.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('reports', '0001_initial'), 11 | ] 12 | 13 | operations = [ 14 | migrations.RemoveField( 15 | model_name='report', 16 | name='bank', 17 | ), 18 | migrations.DeleteModel( 19 | name='Bank', 20 | ), 21 | migrations.AddField( 22 | model_name='report', 23 | name='idrssd', 24 | field=models.PositiveIntegerField(default=0), 25 | preserve_default=False, 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /web/reports/util.py: -------------------------------------------------------------------------------- 1 | def model_unicode(instance, fields): 2 | vals = [getattr(instance, x) for x in fields] 3 | 4 | def force_unicode(s): 5 | if isinstance(s, str): 6 | return s.decode('utf-8') 7 | return unicode(s) 8 | 9 | return u'{}({})'.format( 10 | instance.__class__.__name__, 11 | u', '.join([force_unicode(v) for v in vals]) 12 | ) 13 | 14 | 15 | 16 | class IndexableQuery(object): 17 | def __init__(self, count, query): 18 | self.count = count 19 | self.query = query 20 | 21 | 22 | def __getitem__(self, i): 23 | return self[i:i+1] 24 | 25 | 26 | def __getslice__(self, i, j): 27 | return self.query(i, j - i + 1) 28 | 29 | 30 | def __len__(self): 31 | return self.count 32 | -------------------------------------------------------------------------------- /web/reports/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {% block title %}{% endblock %} 8 | 9 | 10 | {% block head %}{% endblock %} 11 | 12 | 13 | {% block body %}{% endblock %} 14 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2015 Andy Chosak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /web/reports/views.py: -------------------------------------------------------------------------------- 1 | from django.http import Http404 2 | from django.views.generic import ListView 3 | 4 | from reports.models import Report 5 | from reports.util import IndexableQuery 6 | 7 | 8 | 9 | class IndexView(ListView): 10 | template_name = 'index.html' 11 | context_object_name = 'banks' 12 | paginate_by = 100 13 | 14 | 15 | def __init__(self, *args, **kwargs): 16 | super(IndexView, self).__init__(*args, **kwargs) 17 | self.num_banks = Report.objects.num_banks() 18 | 19 | 20 | def get_queryset(self): 21 | return IndexableQuery(self.num_banks, Report.objects.most_recent) 22 | 23 | 24 | def get_context_data(self, **kwargs): 25 | context = super(IndexView, self).get_context_data(**kwargs) 26 | context['num_banks'] = self.num_banks 27 | return context 28 | 29 | 30 | 31 | class BankView(ListView): 32 | template_name = 'bank.html' 33 | context_object_name = 'reports' 34 | 35 | 36 | def get_queryset(self): 37 | qs = Report.objects \ 38 | .filter(idrssd=self.kwargs['idrssd']) \ 39 | .order_by('-date') 40 | 41 | if not qs.exists(): 42 | raise Http404 43 | 44 | return list(qs) 45 | -------------------------------------------------------------------------------- /web/reports/management/commands/load.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datetime import datetime 4 | from django.core.management import BaseCommand, CommandError 5 | from optparse import make_option 6 | 7 | from reports.models import Report 8 | 9 | 10 | 11 | class Command(BaseCommand): 12 | help = 'load bank data into database' 13 | 14 | 15 | option_list = BaseCommand.option_list + ( 16 | make_option('-f', '--filename', help='input filename'), 17 | ) 18 | 19 | 20 | def handle(self, *args, **kwargs): 21 | filename = kwargs['filename'] 22 | 23 | if filename is None: 24 | raise CommandError('filename') 25 | 26 | Report.objects.all().delete() 27 | 28 | with open(filename, 'rb') as f: 29 | for bank_json in map(json.loads, f): 30 | self.load_bank(bank_json) 31 | 32 | 33 | def load_bank(self, bank_json): 34 | idrssd = bank_json['IDRSSD'] 35 | print('loading bank {}'.format(idrssd)) 36 | 37 | reports = Report.objects.bulk_create([ 38 | self.construct_report(idrssd, r) for r in bank_json['reports'] 39 | ]) 40 | print('loaded {} reports'.format(len(list(reports)))) 41 | 42 | 43 | def construct_report(self, idrssd, report_json): 44 | report_json['date'] = datetime.strptime( 45 | report_json['date'], 46 | '%m%d%Y' 47 | ).date() 48 | report_json['zipcode'] = report_json.pop('zip') 49 | 50 | return Report(idrssd=idrssd, **report_json) 51 | -------------------------------------------------------------------------------- /web/reports/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name='Bank', 15 | fields=[ 16 | ('idrssd', models.PositiveIntegerField(serialize=False, primary_key=True)), 17 | ], 18 | options={ 19 | }, 20 | bases=(models.Model,), 21 | ), 22 | migrations.CreateModel( 23 | name='Report', 24 | fields=[ 25 | ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 26 | ('date', models.DateField()), 27 | ('name', models.CharField(max_length=255)), 28 | ('address', models.CharField(max_length=255)), 29 | ('city', models.CharField(max_length=255)), 30 | ('state', models.CharField(max_length=16)), 31 | ('zipcode', models.PositiveIntegerField()), 32 | ('assets', models.PositiveIntegerField()), 33 | ('deposits', models.PositiveIntegerField()), 34 | ('liabilities', models.PositiveIntegerField()), 35 | ('bank', models.ForeignKey(related_name=b'reports', to='reports.Bank')), 36 | ], 37 | options={ 38 | }, 39 | bases=(models.Model,), 40 | ), 41 | ] 42 | -------------------------------------------------------------------------------- /web/reports/templates/bank.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | {% load filters %} 3 | 4 | 5 | {% block title %}FDIC Call Report Browser{% endblock %} 6 | 7 | 8 | {% block head %} 9 | 15 | {% endblock %} 16 | 17 | 18 | {% block body %} 19 | {% with reports|last as last_report %} 20 |
21 |
22 |
23 |

FDIC Call Report Browser

24 |

{{ last_report.name }}

25 |

{{ last_report.address.strip }}, {{ last_report.city.strip }}, {{ last_report.state.strip }} {{ last_report.zipcode }}

26 |
27 |
28 |
29 |
30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {% for report in reports %} 41 | 42 | 43 | 44 | 45 | 46 | 47 | {% endfor %} 48 | 49 |
Report DateAssetsDepositsLiabilities
{{ report.date }}{{ report.assets|money:1000 }}M{{ report.deposits|money:1000 }}M{{ report.liabilities|money:1000 }}M
50 |
51 |
52 |
53 | {% endwith %} 54 | {% endblock %} 55 | -------------------------------------------------------------------------------- /web/reports/models.py: -------------------------------------------------------------------------------- 1 | from django.db import connection, models 2 | 3 | from reports.util import model_unicode 4 | 5 | 6 | 7 | class ReportManager(models.Manager): 8 | def most_recent(self, offset=None, limit=None): 9 | inner_query = ''' 10 | SELECT 11 | idrssd, 12 | name, 13 | max(date) max_date 14 | FROM 15 | reports_report 16 | GROUP BY 17 | idrssd 18 | ORDER BY 19 | name ASC 20 | '''.strip() 21 | 22 | if limit: 23 | inner_query += ' LIMIT {}'.format(int(limit)) 24 | if offset: 25 | inner_query += ' OFFSET {}'.format(int(offset)) 26 | 27 | query = ''' 28 | SELECT 29 | * 30 | FROM 31 | reports_report r1 32 | INNER JOIN ( 33 | {inner_query} 34 | ) r2 35 | ON 36 | r1.idrssd = r2.idrssd AND 37 | r1.date = r2.max_date 38 | ORDER BY 39 | r2.name ASC 40 | '''.strip().format( 41 | inner_query=inner_query 42 | ) 43 | 44 | return self.model.objects.raw(query) 45 | 46 | 47 | def num_banks(self): 48 | return len(self.values_list('idrssd', flat=True).distinct()) 49 | 50 | 51 | 52 | class Report(models.Model): 53 | idrssd = models.PositiveIntegerField() 54 | date = models.DateField() 55 | name = models.CharField(max_length=255) 56 | 57 | address = models.CharField(max_length=255) 58 | city = models.CharField(max_length=255) 59 | state = models.CharField(max_length=16) 60 | zipcode = models.PositiveIntegerField() 61 | 62 | assets = models.PositiveIntegerField() 63 | deposits = models.PositiveIntegerField() 64 | liabilities = models.PositiveIntegerField() 65 | 66 | 67 | objects = ReportManager() 68 | 69 | 70 | def __unicode__(self): 71 | return model_unicode(self, ( 72 | 'idrssd', 'date', 'name', 'address', 'city', 'state', 'zipcode', 73 | 'assets', 'deposits', 'liabilities' 74 | )) 75 | -------------------------------------------------------------------------------- /web/settings/dev.py: -------------------------------------------------------------------------------- 1 | import os, socket 2 | BASE_DIR = os.path.dirname(os.path.dirname(__file__)) 3 | 4 | 5 | SECRET_KEY = '4h=6tp37*3c&92f$y00%!r4+s!l*w*iij07n-cjqk&tuf=%3wa' 6 | 7 | DEBUG = True 8 | TEMPLATE_DEBUG = True 9 | 10 | ALLOWED_HOSTS = [] 11 | 12 | INSTALLED_APPS = ( 13 | 'django.contrib.contenttypes', 14 | 'django.contrib.staticfiles', 15 | 16 | 'debug_toolbar', 17 | 18 | 'reports', 19 | ) 20 | 21 | MIDDLEWARE_CLASSES = ( 22 | 'debug_toolbar.middleware.DebugToolbarMiddleware', 23 | 'django.middleware.common.CommonMiddleware', 24 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 25 | ) 26 | 27 | ROOT_URLCONF = 'reports.urls' 28 | WSGI_APPLICATION = 'wsgi.application' 29 | 30 | DATABASES = { 31 | 'default': { 32 | 'ENGINE': 'django.db.backends.mysql', 33 | 'NAME': 'django', 34 | 'HOST': os.environ['DJANGO_RDS_HOST'], 35 | 'USER': os.environ['DJANGO_RDS_USER'], 36 | 'PASSWORD': os.environ['DJANGO_RDS_PASSWORD'], 37 | 'PORT': 3306, 38 | }, 39 | } 40 | 41 | LANGUAGE_CODE = 'en-us' 42 | TIME_ZONE = 'UTC' 43 | USE_I18N = True 44 | USE_L10N = True 45 | USE_TZ = True 46 | 47 | STATIC_URL = '/static/' 48 | 49 | LOGGING = { 50 | 'version': 1, 51 | 'disable_existing_loggers': True, 52 | 'formatters': { 53 | 'standard': { 54 | 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 55 | }, 56 | }, 57 | 'handlers': { 58 | 'console': { 59 | 'level': 'DEBUG', 60 | 'class': 'logging.StreamHandler', 61 | 'formatter': 'standard', 62 | }, 63 | }, 64 | 'loggers': { 65 | 'reports': { 66 | 'handlers': ['console'], 67 | 'level': 'DEBUG', 68 | } 69 | }, 70 | } 71 | 72 | try: 73 | socket.create_connection(('localhost', 11211)) 74 | except socket.error: 75 | CACHES = { 76 | 'default': { 77 | 'BACKEND': 'django.core.cache.backends.dummy.DummyCache', 78 | } 79 | } 80 | else: 81 | CACHES = { 82 | 'default': { 83 | 'BACKEND': 'django.core.cache.backends.memcached.MemcachedCache', 84 | 'LOCATION': ['localhost:11211',] 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /web/settings/prod.py: -------------------------------------------------------------------------------- 1 | import os 2 | BASE_DIR = os.path.dirname(os.path.dirname(__file__)) 3 | 4 | 5 | SECRET_KEY = '4h=6tp37*3c&92f$y00%!r4+s!l*w*iij07n-cjqk&tuf=%3wa' 6 | 7 | DEBUG = False 8 | TEMPLATE_DEBUG = False 9 | 10 | ALLOWED_HOSTS = [ 11 | '.chosak.org', 12 | ] 13 | 14 | INSTALLED_APPS = ( 15 | 'django.contrib.contenttypes', 16 | 'django.contrib.staticfiles', 17 | 'reports', 18 | ) 19 | 20 | MIDDLEWARE_CLASSES = ( 21 | 'django.middleware.common.CommonMiddleware', 22 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 23 | ) 24 | 25 | ROOT_URLCONF = 'reports.urls' 26 | WSGI_APPLICATION = 'wsgi.application' 27 | 28 | DATABASES = { 29 | 'default': { 30 | 'ENGINE': 'django.db.backends.mysql', 31 | 'NAME': 'django', 32 | 'HOST': os.environ['DJANGO_RDS_HOST'], 33 | 'USER': os.environ['DJANGO_RDS_USER'], 34 | 'PASSWORD': os.environ['DJANGO_RDS_PASSWORD'], 35 | 'PORT': 3306, 36 | }, 37 | } 38 | 39 | LANGUAGE_CODE = 'en-us' 40 | TIME_ZONE = 'UTC' 41 | USE_I18N = True 42 | USE_L10N = True 43 | USE_TZ = True 44 | 45 | STATIC_URL = '/static/' 46 | 47 | LOGGING = { 48 | 'version': 1, 49 | 'disable_existing_loggers': True, 50 | 'formatters': { 51 | 'standard': { 52 | 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 53 | }, 54 | }, 55 | 'handlers': { 56 | 'console': { 57 | 'level': 'DEBUG', 58 | 'class': 'logging.StreamHandler', 59 | 'formatter': 'standard', 60 | }, 61 | 'file': { 62 | 'level':'INFO', 63 | 'class':'logging.handlers.RotatingFileHandler', 64 | 'filename': '/var/log/wsgi/app.log', 65 | 'maxBytes': 1024*1024*5, # 5 MB 66 | 'backupCount': 5, 67 | 'formatter':'standard', 68 | }, 69 | 'error': { 70 | 'level': 'ERROR', 71 | 'class':'logging.handlers.RotatingFileHandler', 72 | 'filename': '/var/log/wsgi/error.log', 73 | 'maxBytes': 1024*1024*5, # 5 MB 74 | 'backupCount': 5, 75 | 'formatter':'standard', 76 | }, 77 | }, 78 | 'loggers': { 79 | '': { 80 | 'handlers': ['error'], 81 | 'level': 'ERROR', 82 | 'propagate': True, 83 | }, 84 | 'reports': { 85 | 'handlers': ['console', 'file'], 86 | 'level': 'DEBUG', 87 | 'propagate': True, 88 | } 89 | }, 90 | } 91 | 92 | memcached_server = os.environ.get('MEMCACHED_SERVER') 93 | if memcached_server: 94 | CACHES = { 95 | 'default': { 96 | 'BACKEND': 'django.core.cache.backends.memcached.MemcachedCache', 97 | 'LOCATION': [ 98 | memcached_server, 99 | ] 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /bin/mrjob-transform.py: -------------------------------------------------------------------------------- 1 | import mrjob.compat 2 | import mrjob.conf 3 | import mrjob.job 4 | import mrjob.protocol 5 | import mrjob.step 6 | import os 7 | import re 8 | 9 | 10 | 11 | class BankMetadataJob(mrjob.job.MRJob): 12 | INPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol 13 | OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol 14 | 15 | 16 | def steps(self): 17 | return [ 18 | mrjob.step.MRStep( 19 | mapper=self.mapper, 20 | reducer=self.date_reducer 21 | ), 22 | mrjob.step.MRStep( 23 | reducer=self.bank_reducer 24 | ), 25 | ] 26 | 27 | 28 | def mapper(self, _, row): 29 | bank_id = row.pop('"IDRSSD"', None) 30 | 31 | if bank_id is None: 32 | self.increment_counter('mapper', 'no_id') 33 | return 34 | 35 | filename = mrjob.compat.get_jobconf_value('map.input.file') 36 | 37 | if 'Bulk POR' in filename: 38 | data = self.get_bank_info(row) 39 | elif 'Schedule RC ' in filename: 40 | data = self.get_balance_sheet_info(row) 41 | else: 42 | raise ValueError('unsupported filename: {}'.format(filename)) 43 | 44 | date = self.get_filename_date(filename) 45 | 46 | yield (bank_id, date), data 47 | 48 | 49 | def get_filename_date(self, filename): 50 | filename = os.path.basename(filename) 51 | pattern = ( 52 | 'FFIEC CDR Call (Bulk|Schedule) (\w+) (\d{8})(\(\d+ of \d+\))?.txt' 53 | ) 54 | match = re.compile(pattern).match(filename) 55 | 56 | if not match or 3 > len(match.groups()): 57 | raise ValueError('bad filename: {}'.format(filename)) 58 | 59 | return match.group(3) 60 | 61 | 62 | def get_bank_info(self, row): 63 | return { 64 | 'name': row['Financial Institution Name'], 65 | 'address': row['Financial Institution Address'], 66 | 'city': row['Financial Institution City'], 67 | 'state': row['Financial Institution State'], 68 | 'zip': row['Financial Institution Zip Code'], 69 | } 70 | 71 | 72 | def get_balance_sheet_info(self, row): 73 | return { 74 | 'assets': row['TOTAL ASSETS'], 75 | 'deposits': row['TOTAL DEPOSITS'], 76 | 'liabilities': row['TOTAL LIABILITIES'], 77 | } 78 | 79 | 80 | def date_reducer(self, map_key, rows): 81 | bank_id, date = map_key 82 | 83 | data = mrjob.conf.combine_dicts(*rows) 84 | data['date'] = date 85 | 86 | yield bank_id, data 87 | 88 | 89 | def bank_reducer(self, bank_id, rows): 90 | data = { 91 | 'IDRSSD': bank_id, 92 | 'reports': sorted(rows, key=lambda r: r['date']) 93 | } 94 | 95 | self.increment_counter('banks', 'output') 96 | yield bank_id, data 97 | 98 | 99 | 100 | if __name__ == '__main__': 101 | # Usage: python jobs/banks.py \ 102 | # input/*/*Bulk*.txt 103 | # input/*/*RC\ *.txt 104 | # > output.txt 105 | BankMetadataJob.run() 106 | -------------------------------------------------------------------------------- /web/reports/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | 4 | {% block title %}FDIC Call Report Browser{% endblock %} 5 | 6 | 7 | {% block head %} 8 | 14 | {% endblock %} 15 | 16 | 17 | {% block body %} 18 |
19 |
20 |
21 |

FDIC Call Report Browser

22 |

{{ num_banks }} banks

23 |
24 |
25 | {% if is_paginated %} 26 |
27 |
28 | 43 |
44 |
45 | {% endif %} 46 |
47 |
48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | {% for bank in banks %} 58 | 59 | 62 | 63 | 64 | 65 | {% endfor %} 66 | 67 |
IDRSSDNameLast Report
60 | {{ bank.idrssd }} 61 | {{ bank.name }}{{ bank.date }}
68 |
69 |
70 | {% if is_paginated %} 71 |
72 |
73 | 88 |
89 |
90 | {% endif %} 91 |
92 | {% endblock %} 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Domestically-chartered banks regulated by the FDIC are required to file quarterly financial reports to regulators, known colloquially as "[call reports](http://en.wikipedia.org/wiki/Call_report)". This data is made publically available [by the FFIEC](https://cdr.ffiec.gov/public/) but is difficult to download efficiently and hard to analyze in aggregate. 2 | 3 | This project aims to simplify data access and open up new methods for analysis. 4 | 5 | It currently consists of 3 major pieces: 6 | 7 | 1. A downloader to retrieve all bulk call report data, 8 | 2. A MapReduce job that processes raw data to produce aggregate data per bank, and 9 | 3. A Django-powered web application that allows for exploration and review of aggregated data 10 | 11 | #### Downloading raw call report data 12 | 13 | While the current [FFIEC download site](https://cdr.ffiec.gov/public/) does offer minimal bulk downloading capabilities, it's not easy to retrieve a detailed set of all call report data over time. Without an API, the only way to do this is to use a browser to retrieve each dataset. 14 | 15 | The `bin/download_all_reports.py` script automates this process by using [Selenium](http://www.seleniumhq.org) to iteratively download all available single quarterly call reports. See the comments in that script for instructions on how to setup Selenium locally. 16 | 17 | Files downloaded from this script are stored in their original archive format (`.zip`). 18 | 19 | #### Preparing call report data for MapReduce 20 | 21 | The `bin/extract.py` script extracts all raw text files from the downloaded call report archives and prepares them for processing by MapReduce. Each file is converted to line-JSON format, and optionally stored in a bucket on S3. Files are organized first by report date and then by report type. 22 | 23 | Note that all code that leverages AWS resources like S3 follows standard convention and assumes that you have your `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables set. 24 | 25 | #### Running MapReduce 26 | 27 | A MapReduce job is written to handle the numerous input files, group data by report type and bank, and output in a normalized way.The excellent [mrjob](http://mrjob.readthedocs.org/en/latest/index.html) library is used so that the job can be written in Python. 28 | 29 | See `bin/mrjob-transform.py` for the code; it currently handles two report types: "Bulk POR" (general bank metadata) and "Schedule RC" (balance sheet information). 30 | 31 | To run locally, you can do something like: 32 | 33 | ```sh 34 | python jobs/banks.py input/*/*Bulk*.txt input/*/*RC\ *.txt > output.txt 35 | ``` 36 | 37 | Optionally, you can also run on Elastic MapReduce. See mrjob [documentation](http://mrjob.readthedocs.org/en/latest/guides/emr-quickstart.html) for details. 38 | 39 | #### Loading data into MySQL 40 | 41 | MySQL is used as a datastore for the Django web application. A loader script is provided to read in the file generated in the previous MapReduce step. 42 | 43 | To import data, simply run: 44 | 45 | ```sh 46 | ./manage.py load -f output.txt 47 | ``` 48 | 49 | This requires that you have the `DJANGO_RDS_HOST`, `DJANGO_RDS_USER`, and `DJANGO_RDS_PASSWORD` environment variables set to appropriate database credentials. 50 | 51 | #### The Django web application 52 | 53 | The Django web application allows for browsing of all data generated in previous steps. 54 | 55 | The home page shows a list of all banks over all time, alphabetized by name. 56 | 57 | ![index](http://i.imgur.com/b8Oira5l.png) 58 | 59 | Each bank's ID links to a page displaying the details of its balance sheet over time. 60 | 61 | ![bank](http://i.imgur.com/Q3CfGHJl.png) 62 | -------------------------------------------------------------------------------- /bin/download_all_reports.py: -------------------------------------------------------------------------------- 1 | import logging, os, time 2 | from contextlib import contextmanager 3 | from selenium import webdriver 4 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 5 | from selenium.webdriver.support.ui import Select 6 | 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | SELENIUM_SERVER = os.environ.get( 13 | 'SELENIUM_SERVER', 14 | 'http://localhost:4444/wd/hub' 15 | ) 16 | 17 | SELENIUM_BROWSER = os.environ.get( 18 | 'SELENIUM_BROWSER', 19 | 'CHROME' 20 | ) 21 | 22 | 23 | def run(): 24 | ''' 25 | Uses a web browser to download all available FDIC call report data from the 26 | URL defined as DOWNLOADS_PAGE above. 27 | 28 | Uses Selenium which must be running using, e.g. 29 | 30 | java -jar selenium-server-standalone-2.43.1.jar 31 | 32 | By default the script tries to connect to a server running locally on 33 | port 4444, but this may be overridden through use of the SELENIUM_SERVER 34 | environment variable. 35 | 36 | The standalone Selenium server tries to use the default system browser. 37 | To use a different browser like Chrome, add this command-line option to 38 | the java call: 39 | 40 | -Dwebdriver.chrome.driver=/path/to/chromedriver 41 | 42 | Also set the SELENIUM_BROWSER environment variable to CHROME. 43 | 44 | This program triggers downloads in tab-delimited format which get 45 | saved to the default browser download location. 46 | ''' 47 | with selenium_driver() as driver: 48 | count = 0 49 | 50 | while True: 51 | logger.info('navigating to data download page') 52 | driver.get('https://cdr.ffiec.gov/public/PWS/DownloadBulkData.aspx') 53 | 54 | logger.info('setting download type to single period') 55 | dl_type = Select(driver.find_element_by_id('ListBox1')) 56 | dl_type.select_by_value('ReportingSeriesSinglePeriod') 57 | time.sleep(3) 58 | 59 | logger.info('finding available reporting periods') 60 | periods = Select(driver.find_element_by_id('DatesDropDownList')) 61 | 62 | if not count: 63 | logger.info('{} available reporting periods: {}'.format( 64 | len(periods.options), 65 | ', '.join([period.text for period in periods.options]) 66 | )) 67 | 68 | if count == len(periods.options): 69 | break 70 | 71 | period = periods.options[count] 72 | logger.info('downloading data for period {}'.format(period.text)) 73 | 74 | periods.select_by_index(count) 75 | time.sleep(3) 76 | 77 | submit_button = driver.find_element_by_id('Download_0') 78 | submit_button.click() 79 | time.sleep(3) 80 | 81 | count += 1 82 | 83 | logger.info('waiting for last download to finish') 84 | time.sleep(30) 85 | 86 | 87 | @contextmanager 88 | def selenium_driver(): 89 | logger.info('connecting to local Selenium server at {}'.format( 90 | SELENIUM_SERVER 91 | )) 92 | capabilities = getattr(DesiredCapabilities, SELENIUM_BROWSER) 93 | 94 | driver = webdriver.Remote( 95 | SELENIUM_SERVER, 96 | desired_capabilities=capabilities 97 | ) 98 | 99 | try: 100 | driver.implicitly_wait(10) 101 | driver.set_page_load_timeout(10) 102 | 103 | yield driver 104 | finally: 105 | logger.info('disconnecting from local Selenium server') 106 | driver.quit() 107 | 108 | 109 | if __name__ == '__main__': 110 | run() 111 | -------------------------------------------------------------------------------- /bin/extract.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import boto 5 | import csv 6 | import json 7 | import os 8 | import re 9 | import shutil 10 | 11 | from glob import glob 12 | from tempfile import NamedTemporaryFile 13 | from zipfile import ZipFile 14 | 15 | 16 | 17 | class Extractor(object): 18 | def __init__(self, input_dir, output_dir): 19 | self.input_dir = input_dir 20 | self.output_dir = output_dir 21 | 22 | 23 | def extract(self, convert_to_json=True): 24 | for zfn in glob(os.path.join(self.input_dir, 'FFIEC*.zip')): 25 | date_match = re.compile('.*(\d{8}).zip').match(zfn) 26 | date = date_match.groups()[0] 27 | print('extracting {} reports from {}'.format(date, zfn)) 28 | 29 | report_path = os.path.join(self.output_dir, date) 30 | if not os.path.exists(report_path): 31 | os.makedirs(report_path) 32 | 33 | count = 0 34 | with ZipFile(zfn, 'r') as z: 35 | for fn in z.namelist(): 36 | if 'Readme.txt' == fn: 37 | continue 38 | 39 | efn = z.extract(fn, report_path) 40 | count += 1 41 | 42 | if convert_to_json: 43 | self.convert_to_json(efn) 44 | 45 | print('extracted {} reports to {}'.format(count, report_path)) 46 | 47 | 48 | def upload_to_s3(self, bucket, path): 49 | try: 50 | conn = boto.connect_s3() 51 | bucket = conn.get_bucket(bucket) 52 | except Exception: 53 | print( 54 | 'Could not connect to S3 bucket.\n' 55 | 'Make sure that AWS environment variables are set.\n' 56 | 'AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY' 57 | ) 58 | return 59 | 60 | for root, directories, filenames in os.walk(self.output_dir): 61 | for filename in filenames: 62 | full_filename = os.path.join(root, filename) 63 | keyname = full_filename.lstrip(self.output_dir) 64 | k = bucket.new_key(os.path.join(path, keyname)) 65 | 66 | print('uploading {} to s3n://{}/{}'.format( 67 | full_filename, 68 | bucket.name, 69 | k.name 70 | )) 71 | k.set_contents_from_filename(full_filename, replace=True) 72 | 73 | 74 | def convert_to_json(self, filename): 75 | with NamedTemporaryFile() as tf: 76 | with open(tf.name, 'wb') as outf: 77 | with open(filename, 'rb') as inf: 78 | reader = csv.reader( 79 | inf, 80 | delimiter='\t', 81 | quoting=csv.QUOTE_NONE 82 | ) 83 | 84 | keys = reader.next() 85 | 86 | if 'Bulk POR' not in filename: 87 | keys = reader.next() 88 | keys[0] = '"IDRSSD"' 89 | 90 | for line in reader: 91 | d = {k: v for k, v in zip(keys, line)} 92 | data = json.dumps(d) 93 | outf.write(data + '\n') 94 | 95 | shutil.copyfile(tf.name, filename) 96 | 97 | 98 | 99 | if '__main__' == __name__: 100 | parser = argparse.ArgumentParser(description='extract archive data') 101 | parser.add_argument('-i', '--input-dir', help='input archive directory') 102 | parser.add_argument('-o', '--output-dir', help='output archive directory') 103 | parser.add_argument('--keep-raw-format', action='store_false', 104 | dest='convert_to_json', 105 | help='keep raw format (no json conversion)') 106 | parser.add_argument('--s3-bucket', help='s3 upload bucket (optional)') 107 | parser.add_argument('--s3-path', help='s3 upload path (optional)', 108 | default='') 109 | args = parser.parse_args() 110 | 111 | extractor = Extractor(args.input_dir, args.output_dir) 112 | extractor.extract(args.convert_to_json) 113 | 114 | if args.s3_bucket: 115 | extractor.upload_to_s3(args.s3_bucket, args.s3_path) 116 | --------------------------------------------------------------------------------