├── irsdb ├── irsdb │ ├── __init__.py │ ├── wsgi.py │ ├── urls.py │ ├── local_settings.py-example.py │ └── settings.py ├── filing │ ├── __init__.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ ├── enter_new_results.py │ │ │ ├── find_new_filings.py │ │ │ ├── make_manifest.py │ │ │ └── enter_yearly_submissions.py │ └── models.py ├── metadata │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── templatetags │ │ └── xpath_url.py │ ├── urls.py │ ├── irsx_utils.py │ ├── management │ │ └── commands │ │ │ ├── run_bake.py │ │ │ ├── load_metadata.py │ │ │ └── generate_schemas_from_metadata.py │ ├── models.py │ └── views.py ├── return │ ├── __init__.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ ├── drop_indexes.py │ │ │ ├── make_indexes.py │ │ │ ├── remove_year.py │ │ │ ├── remove_half_loaded.py │ │ │ ├── load_filings.py │ │ │ └── load_filings_multithreaded.py │ └── sql │ │ ├── show_db_counts.sql │ │ └── delete_all_return.sql ├── schemas │ ├── __init__.py │ ├── field_utils.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ └── find_empty_heads.py │ ├── documentation_utils.py │ ├── model_accumulator.py │ └── type_utils.py ├── generated_schemas │ └── blank.txt ├── templates │ ├── 404.html │ ├── 500.html │ ├── metadata │ │ ├── group.html │ │ ├── part.html │ │ ├── variable.html │ │ ├── forms.html │ │ └── xpath.html │ └── base.html ├── setup.sh ├── static │ ├── css │ │ ├── ie10-viewport-bug-workaround.css │ │ └── irsx.css │ └── js │ │ ├── ie10-viewport-bug-workaround.js │ │ └── bootstrap.min.js ├── manage.py ├── stream_extractor.py └── dump_from_manifest.py ├── requirements.txt ├── LICENSE ├── .gitignore ├── setup_supporting_tables.sh ├── directors.sh ├── grants.sh ├── contractors.sh ├── README.md └── sked_l.sh /irsdb/irsdb/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /irsdb/filing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/return/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/generated_schemas/blank.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/filing/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/return/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/schemas/field_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /irsdb/schemas/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/filing/management/commands/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/return/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/schemas/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irsdb/metadata/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /irsdb/metadata/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class MetadataConfig(AppConfig): 5 | name = 'metadata' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This means we don't support python<3 2 | Django>=2.0.1 3 | 4 | # for postgres: 5 | psycopg2 6 | 7 | # for mysql 8 | # mysqlclient 9 | 10 | # 11 | irsx 12 | requests 13 | unidecode 14 | 15 | -------------------------------------------------------------------------------- /irsdb/templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}404{% endblock %} 4 | 5 | {% block content %} 6 |
7 |

404: File Not Found

8 |
9 | {% endblock %} -------------------------------------------------------------------------------- /irsdb/templates/500.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}500: Server Error{% endblock %} 4 | {% block content %} 5 |
6 |

500: Server Error

7 |
8 | {% endblock %} -------------------------------------------------------------------------------- /irsdb/return/sql/show_db_counts.sql: -------------------------------------------------------------------------------- 1 | 2 | -- from: https://stackoverflow.com/a/2611745 -- from stats collector rather than live tables, but quick to run 3 | SELECT schemaname,relname,n_live_tup 4 | FROM pg_stat_user_tables 5 | ORDER BY n_live_tup DESC; -------------------------------------------------------------------------------- /irsdb/metadata/templatetags/xpath_url.py: -------------------------------------------------------------------------------- 1 | from django import template 2 | from django.template.defaultfilters import stringfilter 3 | 4 | register = template.Library() 5 | 6 | @register.filter 7 | @stringfilter 8 | def xpath_url(value): 9 | return "/metadata/xpath/" + value.replace("/","-") + ".html" 10 | 11 | -------------------------------------------------------------------------------- /irsdb/setup.sh: -------------------------------------------------------------------------------- 1 | python3 manage.py makemigrations metadata 2 | python3 manage.py migrate metadata 3 | python3 manage.py makemigrations filing 4 | python3 manage.py migrate filing 5 | python3 manage.py enter_yearly_submissions 2016 6 | #python3 manage.py enter_yearly_submissions 2016 --enter 7 | python3 manage.py generate_schemas_from_metadata 8 | python3 manage.py makemigrations return 9 | python3 manage.py migrate return 10 | -------------------------------------------------------------------------------- /irsdb/irsdb/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for irsdb project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "irsdb.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /irsdb/static/css/ie10-viewport-bug-workaround.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | /* 8 | * See the Getting Started docs for more information: 9 | * http://getbootstrap.com/getting-started/#support-ie10-width 10 | */ 11 | @-ms-viewport { width: device-width; } 12 | @-o-viewport { width: device-width; } 13 | @viewport { width: device-width; } 14 | -------------------------------------------------------------------------------- /irsdb/schemas/documentation_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unidecode 3 | 4 | BRACKET_RE = re.compile(r'\[.*?\]') 5 | 6 | def markupify(string): 7 | """ replace _ with \_ [ not need for all markup ] """ 8 | return string.replace("_","\_") 9 | 10 | def debracket(string): 11 | """ Eliminate the bracketed var names in doc, line strings """ 12 | result = re.sub(BRACKET_RE, ';', string) 13 | result = unidecode.unidecode(result) 14 | return result 15 | 16 | def most_recent(semicolon_delimited_string): 17 | result = semicolon_delimited_string.split(";")[-1] 18 | return result -------------------------------------------------------------------------------- /irsdb/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if 
__name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "irsdb.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError as exc: 10 | raise ImportError( 11 | "Couldn't import Django. Are you sure it's installed and " 12 | "available on your PYTHONPATH environment variable? Did you " 13 | "forget to activate a virtual environment?" 14 | ) from exc 15 | execute_from_command_line(sys.argv) 16 | -------------------------------------------------------------------------------- /irsdb/static/css/irsx.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 50px; 3 | } 4 | 5 | .body_template { 6 | padding-left: 15px; 7 | text-align: left; 8 | } 9 | 10 | .indent1 { 11 | padding-left: 30px; 12 | } 13 | 14 | .indent2 { 15 | padding-left: 60px; 16 | } 17 | 18 | .indent3 { 19 | padding-left: 90px; 20 | } 21 | 22 | .anchor { 23 | display:block; 24 | padding-top:50px; 25 | margin-top:-50px; 26 | } 27 | 28 | span.glyphicon-question-sign { 29 | font-size: 20px; 30 | padding: 0px; 31 | margin: 0px; 32 | } 33 | 34 | .top_matter { 35 | border-bottom: 2px solid #DDD; 36 | margin-bottom: 10px; 37 | } 38 | 39 | .emphasized { 40 | background-color: #F5F5F5; 41 | padding: 5px; 42 | margin: 0px; 43 | } -------------------------------------------------------------------------------- /irsdb/return/management/commands/drop_indexes.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from django.db import connection 3 | 4 | class Command(BaseCommand): 5 | help = ''' 6 | remove all filings from a given year by object id. This is faster if indexes are created already. 
7 | ''' 8 | def handle(self, *args, **options): 9 | self.cursor = connection.cursor() 10 | 11 | all_tables = connection.introspection.table_names() 12 | for table in all_tables: 13 | if table.startswith('return'): 14 | index_name = "xx_%s" % table 15 | query = "drop index if exists %s" % (index_name) 16 | print("Running query: '%s' " % query) 17 | self.cursor.execute(query) 18 | -------------------------------------------------------------------------------- /irsdb/static/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | // See the Getting Started docs for more information: 8 | // http://getbootstrap.com/getting-started/#support-ie10-width 9 | 10 | (function () { 11 | 'use strict'; 12 | 13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) { 14 | var msViewportStyle = document.createElement('style') 15 | msViewportStyle.appendChild( 16 | document.createTextNode( 17 | '@-ms-viewport{width:auto!important}' 18 | ) 19 | ) 20 | document.querySelector('head').appendChild(msViewportStyle) 21 | } 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /irsdb/filing/management/commands/enter_new_results.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import requests 4 | 5 | from django.core.management.base import BaseCommand 6 | from filing.models import Filing 7 | from irsx.settings import WORKING_DIRECTORY 8 | 9 | from os.path import isfile, join 10 | 11 | 12 | class Command(BaseCommand): 13 | help = ''' 14 | 15 | ''' 16 | 17 | 18 | 19 | 20 | def handle(self, *args, **options): 21 | infile = open('results.csv', 'r') 22 | years = {} 23 | for row in infile: 24 | 25 | try: 26 | this_year = 
int(row[0:4]) 27 | except ValueError: 28 | print("Skipping %s" % row) 29 | continue 30 | 31 | try: 32 | years[this_year] += 1 33 | except KeyError: 34 | years[this_year] = 1 35 | 36 | print(years) 37 | 38 | 39 | -------------------------------------------------------------------------------- /irsdb/irsdb/urls.py: -------------------------------------------------------------------------------- 1 | """irsdb URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/2.0/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | from django.urls import include, path 18 | from django.views.generic import TemplateView 19 | 20 | urlpatterns = [ 21 | path('metadata/', include('metadata.urls')) 22 | ] 23 | -------------------------------------------------------------------------------- /irsdb/return/management/commands/make_indexes.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from django.db import connection 3 | 4 | class Command(BaseCommand): 5 | help = ''' 6 | remove all filings from a given year by object id. This is faster if indexes are created already. 
7 | ''' 8 | def handle(self, *args, **options): 9 | self.cursor = connection.cursor() 10 | 11 | all_tables = connection.introspection.table_names() 12 | for table in all_tables: 13 | if table.startswith('return'): 14 | index_name = "xx_%s" % table 15 | if table.startswith('return_skdk'): 16 | query = "create index %s on %s (object_id, ein, \"documentId\")" % (index_name, table) 17 | 18 | else: 19 | query = "create index %s on %s (object_id, ein)" % (index_name, table) 20 | print("Running query: '%s' " % query) 21 | self.cursor.execute(query) -------------------------------------------------------------------------------- /irsdb/metadata/urls.py: -------------------------------------------------------------------------------- 1 | """irsdb URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/2.0/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.urls import include, path 14 | 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | 17 | from django.urls import path, re_path 18 | from metadata import views 19 | 20 | 21 | urlpatterns = [ 22 | path(r'forms.html', views.show_forms), 23 | path(r'about.html', views.show_about), 24 | re_path(r'parts/(?P[\w\d]+).html$', views.show_part), 25 | re_path(r'groups/(?P[\w\d]+).html$', views.show_group), 26 | re_path(r'xpath/(?P.+).html', views.show_xpath), 27 | re_path(r'variable/(?P[\w\d\_]+)\-(?P[\w\d]+).html$', views.show_variable), 28 | 29 | ] 30 | 31 | -------------------------------------------------------------------------------- /irsdb/irsdb/local_settings.py-example.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # local variables read into settings.py. 4 | # variables set here will override whatever's in settings.py 5 | 6 | DATABASES = { 7 | 'default': { 8 | 'ENGINE': 'django.db.backends.postgresql_psycopg2', 9 | 'NAME': '', 10 | 'USER': '', 11 | 'PASSWORD': '', 12 | 'HOST': '', 13 | 'PORT': '', 14 | } 15 | } 16 | 17 | 18 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 19 | 20 | DEBUG=True 21 | ALLOWED_HOSTS = ['localhost',] 22 | 23 | FILE_SYSTEM_BASE = os.path.join(BASE_DIR, 'baked_site') 24 | USE_TZ = True 25 | TEMPLATE_ROOT = os.path.join(BASE_DIR, 'templates/') 26 | 27 | TEMPLATES = [ 28 | { 29 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 30 | 'DIRS': [TEMPLATE_ROOT,], 31 | 'APP_DIRS': False, 32 | 'OPTIONS': { 33 | 'context_processors': [ 34 | 'django.template.context_processors.debug', 35 | 'django.template.context_processors.request', 36 | 'django.contrib.auth.context_processors.auth', 37 | 'django.contrib.messages.context_processors.messages', 38 | ], 39 | }, 40 | }, 41 | ] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause 
License 2 | 3 | Copyright (c) 2018, Jacob Fenton 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /irsdb/templates/metadata/group.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% load xpath_url %} 3 | 4 | 5 | {% block title %}{{this_group.parent_sked }}: {{ this_group.db_name}}{% endblock %} 6 | 7 | {% block content %} 8 |
9 | 15 |

Repeating Group {{ this_group.db_name}}

16 |
17 | {% for variable in variables %} 18 |
19 |

Variable Name: {{variable.db_name}}

20 | 21 |

Variable Table: {{ this_group.db_name}} [ Repeating Group - {{this_group.parent_sked }}] {# This is confusing, but the table *is* the group's db_name #} 22 |
Line Number: {{variable.line_number}} 23 |
Description: {{variable.description}} 24 |
Type: {{variable.db_type}} 25 |
Xpath: {{variable.xpath}} 26 |

27 |
28 | {% endfor %} 29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /irsdb/filing/management/commands/find_new_filings.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import requests 4 | 5 | from django.core.management.base import BaseCommand 6 | from filing.models import Filing 7 | from django.conf import settings 8 | from irsx.settings import WORKING_DIRECTORY 9 | from irsx.file_utils import stream_download 10 | 11 | from os.path import isfile, join 12 | 13 | 14 | class Command(BaseCommand): 15 | help = ''' 16 | Read the yearly csv file line by line and add new lines if 17 | they don't exist. Lines are added in bulk at the end. 18 | ''' 19 | 20 | 21 | def get_writer(self, headers): 22 | outfilehandle = open('results.csv', 'w') 23 | dw = csv.DictWriter(outfilehandle, headers, extrasaction='ignore') 24 | dw.writeheader() 25 | return dw 26 | 27 | def handle(self, *args, **options): 28 | print('reviewing findings in %s dir' % WORKING_DIRECTORY) 29 | headers = ['object_id',] 30 | writer = self.get_writer(headers) 31 | 32 | onlyfiles = [f for f in os.listdir(WORKING_DIRECTORY) if isfile(join(WORKING_DIRECTORY, f))] 33 | num_found = 0 34 | found_files = {} 35 | 36 | for file in onlyfiles: 37 | 38 | return_id = file.replace("_public.xml", "") 39 | 40 | try: 41 | this_filing = Filing.objects.get(object_id=return_id) 42 | except Filing.DoesNotExist: 43 | writer.writerow({'object_id': return_id}) 44 | except Filing.MultipleObjectsReturned: 45 | print("Multiple objects returned for %s " % return_id) 46 | 47 | num_found += 1 48 | print(found_files) 49 | print("Found a total of %s filings not entered" % num_found) 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /irsdb/templates/metadata/part.html: -------------------------------------------------------------------------------- 1 | {% extends 
"base.html" %} 2 | {% load xpath_url %} 3 | 4 | {% block title %}Form Part Details: {{this_part.parent_sked }} {{ this_part.part_name}}{% endblock %} 5 | 6 | {% block content %} 7 |
8 | 13 |

Form {{this_part.parent_sked }}: {{ this_part.part_name}}

14 |
15 | {% if related_groups %} 16 |

Repeating groups from this part: 17 | {% for group in related_groups %} 18 | {{ group.db_name }}{% if forloop.last %}{% else %}, {% endif %} 19 | {% endfor %} 20 | {% endif %} 21 | {% if variables %} 22 | {% for variable in variables %} 23 |

24 |

Variable Name: {{variable.db_name}}

25 |

Variable Table: {{ this_part.parent_sked_part }} [ Schedule Part ] {# This is confusing, but the table *is* the parts's db_name #} 26 |
Line Number: {{variable.line_number}} 27 |
Description: {{variable.description}} 28 |
Type: {{variable.db_type}} 29 |
Xpath: {{variable.xpath}} 30 |

31 |
32 | {% endfor %} 33 | {% else %} 34 |

There are no non-repeating variables in this part--most likely the contents of this part are in a repeating group above.

35 | {% endif %} 36 | {% endblock %} 37 | -------------------------------------------------------------------------------- /irsdb/metadata/irsx_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | # field names for 'variables.csv' file 4 | VARIABLE_FIELDNAMES = [ 5 | 'parent_sked', 'parent_sked_part', 'in_a_group', 'db_table', 'ordering', 6 | 'db_name', 'xpath', 'irs_type', 'db_type', 'line_number', 'description', 7 | 'versions' 8 | ] 9 | 10 | GROUP_FIELDNAMES = [ 11 | 'parent_sked', 'parent_sked_part', 'ordering', 'xpath', 'db_name', 12 | 'line_number', 'description', 'headless', 'versions' 13 | ] 14 | 15 | SCHEDULE_PART_FIELDNAMES = [ 16 | 'parent_sked', 'parent_sked_part', 'ordering', 17 | 'part_name', 'xml_root', 'is_shell' 18 | ] 19 | 20 | def get_writer(outfilename, fieldnames): 21 | """ Returns a writer that writes to the csv 'spec' we use 22 | Keeping files consistent makes file diffs more readable. 23 | """ 24 | outfile = open(outfilename, 'w') # 'wb' python 2? 
25 | writer = csv.DictWriter( 26 | outfile, 27 | fieldnames=fieldnames, 28 | delimiter=',', 29 | quotechar='"', 30 | lineterminator='\n', 31 | quoting=csv.QUOTE_MINIMAL 32 | ) 33 | writer.writeheader() 34 | return writer 35 | 36 | def get_variable_writer(outfilename): 37 | return get_writer(outfilename, VARIABLE_FIELDNAMES) 38 | 39 | def get_group_writer(outfilename): 40 | return get_writer(outfilename, GROUP_FIELDNAMES) 41 | 42 | def get_schedule_parts_writer(outfilename): 43 | return get_writer(outfilename, SCHEDULE_PART_FIELDNAMES) 44 | 45 | def clean_value(value): 46 | """ This gets run on every value """ 47 | value = value.lstrip(" ") # Remove leading whitespace 48 | if value=='NA': # Throw out NA's 49 | return '' 50 | return value 51 | 52 | def fix_row(rowdict): 53 | for key in rowdict.keys(): 54 | rowdict[key] = clean_value(rowdict[key]) 55 | return rowdict -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # PyCharm 104 | .idea/ 105 | 106 | # Migrations 107 | irsdb/filing/migrations/ 108 | irsdb/metadata/migrations/ 109 | irsdb/return/migrations/ 110 | -------------------------------------------------------------------------------- /irsdb/templates/metadata/variable.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | 4 | {% block title %}{{ this_variable.db_table }}-{{ this_variable.db_name }}{% endblock %} 5 | 6 | {% block content %} 7 |
8 | 18 |

Variable Details: {{ this_variable.db_name }}

19 |
20 |
21 |

DB Table: {{ this_variable.db_table }}

22 |

DB Name: {{ this_variable.db_name }}

23 | 24 |

Line number: {{this_variable.line_number }}

25 |

Description: {{this_variable.description }}

26 |

IRS Type: {{this_variable.irs_type }}

27 |

Repeating: {{this_variable.in_a_group }}

28 | 29 | {% if xpaths %} 30 |

Xpath details

31 | {% for xpath in xpaths %} 32 |

Xpath: {{xpath.xpath }}

33 |

Years: {{ xpath.version_start }} - {{ xpath.version_end }} 34 | 35 | {% endfor %} 36 | {% endif %} 37 |

38 | 39 | 40 | {% endblock %} 41 | -------------------------------------------------------------------------------- /irsdb/templates/metadata/forms.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block forms_active %}class="active"{% endblock %} 4 | 5 | {% block title %}Show all forms{% endblock %} 6 | 7 | {% block content %} 8 |
9 |

Form Index

10 |
11 |

The listing of form parts and repeating groups below describes the consistent structure and naming convention IRSx uses to represent nonprofit tax filings. Click on a form part or a repeating group named below to see the individual line items, their database table and names, and the line number and description given to them in the IRS' metadata. For more, see the about page.

12 |

Jump to: Header/ReturnHeader990x Forms 990 990EZ 990PF Lettered Schedules: A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, R

13 | 14 | {% for form in forms %} 15 | 16 |
17 |

{{form.sked_name}}

18 | {% for part in form.parts %} 19 |
20 |

{{ part.part.part_name }}

21 | {% for group in part.groups %} 22 |

Repeating group: {{ group }}

23 | {% endfor %} 24 |
25 | {% endfor %} 26 |
27 | {% endfor %} 28 | 29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /irsdb/return/management/commands/remove_year.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from django.db import connection 3 | 4 | class Command(BaseCommand): 5 | help = ''' 6 | remove all filings from a given year by object id. This is faster if indexes are created already. 7 | ''' 8 | 9 | def add_arguments(self, parser): 10 | # Positional arguments 11 | parser.add_argument('year', nargs=1, type=int) 12 | 13 | def handle(self, *args, **options): 14 | self.cursor = connection.cursor() 15 | self.submission_year = int(options['year'][0]) 16 | 17 | BASE_QUERY = "(select object_id from filing_filing where submission_year=%s)" % self.submission_year 18 | 19 | all_tables = connection.introspection.table_names() 20 | for table in all_tables: 21 | if table.startswith('return'): 22 | query = "delete from %s where object_id in %s" % (table, BASE_QUERY) 23 | print("Running query: '%s' " % query) 24 | result = self.cursor.execute(query) 25 | print("Done '%s'\n" % result ) 26 | 27 | cmds = [ 28 | "update filing_filing set parse_started=False where parse_started = True and submission_year=%s" % self.submission_year, 29 | "update filing_filing set parse_complete=False where parse_complete = True and submission_year=%s" % self.submission_year, 30 | "update filing_filing set process_time=Null where not process_time is Null and submission_year=%s" % self.submission_year, 31 | "update filing_filing set is_error=False where is_error = True and submission_year=%s" % self.submission_year, 32 | "update filing_filing set key_error_count=Null where not key_error_count is Null and submission_year=%s" % self.submission_year, 33 | "update filing_filing set error_details =Null where not error_details is Null and submission_year=%s" % self.submission_year] 34 | 35 
| for cmd in cmds: 36 | print("Running query: '%s' " % cmd) 37 | self.cursor.execute(cmd) 38 | -------------------------------------------------------------------------------- /irsdb/templates/metadata/xpath.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | 4 | {% block title %}Xpath {{this_variable.xpath }}{% endblock %} 5 | 6 | {% block content %} 7 | 8 |
9 | 19 |

Xpath: {{this_variable.xpath }}

20 |
21 |
22 |

DB Table: {{ this_variable.db_table }}

23 |

DB Name: {{this_variable.db_name }}

24 | 25 |

Line number: {{this_variable.line_number }}

26 |

Description: {{this_variable.description }}

27 |

IRS Type: {{this_variable.irs_type }}

28 |

Repeating: {{this_variable.in_a_group }}

29 |

Years: {{this_variable.version_start }}-{{this_variable.version_end }}

30 | 31 | {% if line_numbers %} 32 |

Line numbers:

33 | {% for line_number in line_numbers %} 34 |

Line number: {{line_number.line_number }}

35 |

Years: {{line_number.version_start }} - {{line_number.version_end }}

36 |
37 | {% endfor %} 38 | {% endif %} 39 | 40 | {% if descriptions %} 41 |

Descriptions:

42 | {% for description in descriptions %} 43 |

Description: {{description.description }}

44 |
45 | {% endfor %} 46 | {% endif %} 47 | 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /irsdb/filing/management/commands/make_manifest.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from django.core.management.base import BaseCommand 4 | 5 | from filing.models import Filing 6 | 7 | foundation_manifest = "foundation_manifest.csv" 8 | ein_manifest = "ein_manifest.csv" 9 | 10 | output_file = "initial_manifest.csv" 11 | 12 | headers = ['ein', 'object_id', 'name', 'tax_period', 'form_type', 'is_most_recent', 'missing'] 13 | 14 | 15 | 16 | class Command(BaseCommand): 17 | help = ''' 18 | Read a source csv of eins, and find all filings. 19 | Record which is most recent report. 20 | Disregard form 990T. 21 | ''' 22 | 23 | def write_ein_details(self, ein): 24 | print("\n\nprocessing ein '%s'" % ein) 25 | filings = Filing.objects.filter(ein=ein).order_by('-tax_period', '-sub_date') 26 | first = 1 27 | if filings: 28 | for filing in filings: 29 | if filing.return_type == '990T': 30 | continue 31 | 32 | this_filing_dict = { 33 | 'name':filing.taxpayer_name, 34 | 'form_type': filing.return_type, 35 | 'ein': filing.ein, 36 | 'tax_period': filing.tax_period, 37 | 'is_most_recent':first, 38 | 'object_id':filing.object_id 39 | 40 | } 41 | print("'%s' - %s - %s - %s - %s - %s" % (filing.taxpayer_name, filing.return_type, filing.sub_date, filing.tax_period, filing.return_id, filing.object_id)) 42 | self.dw.writerow(this_filing_dict) 43 | first = 0 44 | else: 45 | this_filing_dict = { 46 | 'ein': ein, 47 | 'missing': 1 48 | } 49 | self.dw.writerow(this_filing_dict) 50 | 51 | 52 | def handle(self, *args, **options): 53 | 54 | outfilehandle = open(output_file, 'w') 55 | self.dw = csv.DictWriter(outfilehandle, headers, extrasaction='ignore') 56 | self.dw.writeheader() 57 | 58 | reader = open(foundation_manifest, 'r') 59 | for row in reader: 60 | 
self.write_ein_details(row.strip()) 61 | 62 | reader = open(ein_manifest, 'r') 63 | for row in reader: 64 | self.write_ein_details(row.strip()) 65 | 66 | -------------------------------------------------------------------------------- /irsdb/return/management/commands/remove_half_loaded.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from django.db import connection 3 | 4 | class Command(BaseCommand): 5 | help = ''' 6 | remove all filings from a given year that appear to be loading errors; where parse_start is true and 7 | parse_complete = False. 8 | ''' 9 | 10 | def add_arguments(self, parser): 11 | # Positional arguments 12 | parser.add_argument('year', nargs=1, type=int) 13 | 14 | def handle(self, *args, **options): 15 | self.cursor = connection.cursor() 16 | self.submission_year = int(options['year'][0]) 17 | 18 | BASE_QUERY = "(select object_id from filing_filing where parse_started=True and parse_complete=False and submission_year=%s)" % self.submission_year 19 | 20 | all_tables = connection.introspection.table_names() 21 | for table in all_tables: 22 | if table.startswith('return'): 23 | query = "delete from %s where object_id in %s" % (table, BASE_QUERY) 24 | print("Running query: '%s' " % query) 25 | result = self.cursor.execute(query) 26 | print("Done '%s'\n" % result ) 27 | 28 | cmds = [ 29 | "update filing_filing set parse_started=False where parse_started = True and parse_complete=False and submission_year=%s" % self.submission_year, 30 | "update filing_filing set parse_complete=False where parse_complete = True and parse_complete=False and submission_year=%s" % self.submission_year, 31 | "update filing_filing set process_time=Null where parse_started = True and not process_time is Null and parse_complete=False and submission_year=%s" % self.submission_year, 32 | "update filing_filing set is_error=False where parse_started = True and is_error = True and 
parse_complete=False and submission_year=%s" % self.submission_year, 33 | "update filing_filing set key_error_count=Null where parse_started = True and not key_error_count is Null and parse_complete=False and submission_year=%s" % self.submission_year, 34 | "update filing_filing set error_details =Null where parse_started = True and not error_details is Null and parse_complete=False and submission_year=%s" % self.submission_year] 35 | 36 | for cmd in cmds: 37 | print("Running query: '%s' " % cmd) 38 | self.cursor.execute(cmd) 39 | -------------------------------------------------------------------------------- /setup_supporting_tables.sh: -------------------------------------------------------------------------------- 1 | -- Write out reference charts: address and org_types 2 | 3 | DROP TABLE if exists address_table; 4 | 5 | SELECT 6 | return_returnheader990x_part_i.ein, 7 | return_returnheader990x_part_i.object_id, 8 | return_returnheader990x_part_i."RtrnHdr_TxPrdEndDt", 9 | return_returnheader990x_part_i."RtrnHdr_TxYr", 10 | return_returnheader990x_part_i."BsnssNm_BsnssNmLn1Txt", 11 | return_returnheader990x_part_i."BsnssNm_BsnssNmLn2Txt", 12 | return_returnheader990x_part_i."BsnssOffcr_PrsnNm", 13 | return_returnheader990x_part_i."BsnssOffcr_PrsnTtlTxt", 14 | return_returnheader990x_part_i."BsnssOffcr_PhnNm", 15 | return_returnheader990x_part_i."BsnssOffcr_EmlAddrssTxt", 16 | return_returnheader990x_part_i."BsnssOffcr_SgntrDt", 17 | return_returnheader990x_part_i."USAddrss_AddrssLn1Txt", 18 | return_returnheader990x_part_i."USAddrss_AddrssLn2Txt", 19 | return_returnheader990x_part_i."USAddrss_CtyNm", 20 | return_returnheader990x_part_i."USAddrss_SttAbbrvtnCd", 21 | return_returnheader990x_part_i."USAddrss_ZIPCd", 22 | return_returnheader990x_part_i."FrgnAddrss_AddrssLn1Txt", 23 | return_returnheader990x_part_i."FrgnAddrss_AddrssLn2Txt", 24 | return_returnheader990x_part_i."FrgnAddrss_CtyNm", 25 | return_returnheader990x_part_i."FrgnAddrss_PrvncOrSttNm", 26 | 
return_returnheader990x_part_i."FrgnAddrss_CntryCd" 27 | INTO address_table 28 | FROM return_returnheader990x_part_i; 29 | 30 | 31 | DROP INDEX IF EXISTS xx_990_address_oid_ein; 32 | CREATE INDEX xx_990_address_oid_ein ON address_table (object_id, ein); 33 | 34 | 35 | drop table if exists org_types; 36 | 37 | select distinct "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", ein, object_id, concat(ein, '/', object_id) as url_base into org_types from return_part_0; 38 | 39 | insert into org_types select distinct "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", ein, object_id, concat(ein, '/', object_id) as url_base from return_ez_part_0; 40 | 41 | insert into org_types("Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", ein, object_id, url_base) select distinct "Orgnztn501c3ExmptPFInd" as "Orgnztn501c3Ind", "Orgnztn501c3TxblPFInd" as "Orgnztn501cInd", "Orgnztn49471TrtdPFInd" as "Orgnztn49471NtPFInd", ein, object_id, concat(ein, '/', object_id) as url_base from return_pf_part_0; 42 | 43 | DROP INDEX IF EXISTS xx_990_entity_oid_ein; 44 | CREATE INDEX xx_990_entity_oid_ein ON org_types (object_id, ein); 45 | -------------------------------------------------------------------------------- /directors.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | -- 990 Employees 4 | 5 | 6 | DROP TABLE IF EXISTS tmp_990_employees; 7 | SELECT address_table.*, 8 | '/IRS990' as form, 9 | "PrsnNm", 10 | "TtlTxt", 11 | "RprtblCmpFrmOrgAmt" as "CmpnstnAmt" 12 | INTO temporary table tmp_990_employees 13 | FROM return_Frm990PrtVIISctnA 14 | LEFT JOIN address_table ON return_Frm990PrtVIISctnA.ein = address_table.ein 15 | AND return_Frm990PrtVIISctnA.object_id=address_table.object_id; 16 | 17 | DROP TABLE IF EXISTS tmp_990_employees_types; 18 | 19 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990_employees.* into temporary table 
tmp_990_employees_types from tmp_990_employees left join org_types on tmp_990_employees.object_id = org_types.object_id and tmp_990_employees.ein = org_types.ein; 20 | \copy tmp_990_employees_types to '/data/file_exports/990_employees.csv' with csv header; 21 | 22 | 23 | -- EZ 24 | 25 | 26 | 27 | DROP TABLE IF EXISTS tmp_990ez_employees; 28 | SELECT address_table.*, 29 | '/IRS990EZ' as form, 30 | "PrsnNm", 31 | "TtlTxt", 32 | "CmpnstnAmt" 33 | INTO temporary table tmp_990EZ_employees 34 | FROM return_EZOffcrDrctrTrstEmpl 35 | LEFT JOIN address_table ON return_EZOffcrDrctrTrstEmpl.ein = address_table.ein 36 | AND return_EZOffcrDrctrTrstEmpl.object_id= address_table.object_id; 37 | 38 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990EZ_employees.* into temporary table tmp_990EZ_employees_types from tmp_990EZ_employees left join org_types on tmp_990EZ_employees.object_id = org_types.object_id and tmp_990EZ_employees.ein = org_types.ein; 39 | \copy tmp_990EZ_employees_types to '/data/file_exports/990EZ_employees.csv' with csv header; 40 | 41 | 42 | -- PF 43 | 44 | 45 | DROP TABLE IF EXISTS tmp_990PF_employees; 46 | SELECT address_table.*, 47 | '/IRS990PF' as form, 48 | "OffcrDrTrstKyEmpl_PrsnNm" AS "PrsnNm", 49 | "OffcrDrTrstKyEmpl_TtlTxt" AS "TtlTxt", 50 | "OffcrDrTrstKyEmpl_CmpnstnAmt" AS "CmpnstnAmt" 51 | INTO temporary table tmp_990PF_employees 52 | FROM return_PFOffcrDrTrstKyEmpl 53 | LEFT JOIN address_table ON return_PFOffcrDrTrstKyEmpl.ein = address_table.ein 54 | AND return_PFOffcrDrTrstKyEmpl.object_id= address_table.object_id; 55 | 56 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990PF_employees.* into temporary table tmp_990PF_employees_types from tmp_990PF_employees left join org_types on tmp_990PF_employees.object_id = org_types.object_id and tmp_990PF_employees.ein = org_types.ein; 57 | \copy tmp_990PF_employees_types to 
'/data/file_exports/990PF_employees.csv' with csv header; 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /irsdb/metadata/management/commands/run_bake.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from django.core.management.base import BaseCommand 4 | from metadata.models import * 5 | 6 | from django.conf import settings 7 | 8 | import requests 9 | 10 | METADATA_DIRECTORY = settings.METADATA_DIRECTORY 11 | REPORT_COUNT = 100 12 | 13 | 14 | FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE 15 | 16 | 17 | class Command(BaseCommand): 18 | help = """ 19 | Bake the site out to files. 20 | """ 21 | 22 | def hit_url(self, url): 23 | print("Baking out url %s" % url) 24 | requests.get(url) 25 | 26 | def run_parts(self): 27 | all_parts = SchedulePart.objects.all() 28 | for part in all_parts: 29 | url = "http://localhost:8000/metadata/parts/" + part.parent_sked_part + ".html" 30 | self.hit_url(url) 31 | 32 | def run_groups(self): 33 | all_groups = Group.objects.all() 34 | for group in all_groups: 35 | url = "http://localhost:8000/metadata/groups/" + group.db_name + ".html" 36 | self.hit_url(url) 37 | 38 | def run_variables(self): 39 | #re_path(r'variable/(?P[\w\d\_]+)\-(?P[\w\d]+).html$', views.show_variable), 40 | all_variables = Variable.objects.all() 41 | for var in all_variables: 42 | var_url = "http://localhost:8000/metadata/variable/" + var.db_table + "-" + var.db_name + ".html" 43 | self.hit_url(var_url) 44 | xpath_url = "http://localhost:8000/metadata/xpath/"+ var.xpath.replace("/","-") + ".html" 45 | self.hit_url(xpath_url) 46 | 47 | 48 | def run_xpaths(self): 49 | all_xpaths = Variable.objects.all() 50 | for xpath in all_xpaths: 51 | print(xpath) 52 | #url = "http://localhost:8000/metadata/variable/" + var.db_table + "-" + var.db_name + ".html" 53 | #self.hit_url(url) 54 | 55 | def run_nav(self): 56 | self.hit_url("http://localhost:8000/metadata/about.html") 
57 | self.hit_url("http://localhost:8000/metadata/forms.html") 58 | 59 | def create_dirs(self): 60 | for subdir in ["parts", "groups", "variable", "xpath"]: 61 | try: 62 | os.makedirs(os.path.join( FILE_SYSTEM_BASE, "metadata", subdir)) 63 | except FileExistsError: 64 | print("File %s exists skipping" % subdir) 65 | 66 | def handle(self, *args, **options): 67 | print("Baking out urls") 68 | self.create_dirs() 69 | self.run_nav() 70 | self.run_parts() 71 | self.run_groups() 72 | self.run_variables() 73 | 74 | """ 75 | re_path(r'xpath/(?P.+).html', views.show_xpath), 76 | re_path(r'variable/(?P[\w\d\_]+)\-(?P[\w\d]+).html$', views.show_variable), 77 | """ 78 | 79 | -------------------------------------------------------------------------------- /irsdb/templates/base.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | {% block title %}{% endblock %} 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 28 | 29 | 30 | 31 | 32 | 49 | 50 |
51 | {% block content %} 52 | {% endblock %} 53 |
54 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /irsdb/schemas/management/commands/find_empty_heads.py: -------------------------------------------------------------------------------- 1 | import os, csv 2 | 3 | from django.core.management.base import BaseCommand 4 | from django.apps import apps 5 | 6 | from django.conf import settings 7 | 8 | from irsx.settings import METADATA_DIRECTORY 9 | 10 | class Command(BaseCommand): 11 | help = """ Find 'empty heads' with no values at all. 12 | """ 13 | 14 | def get_var_hash(self): 15 | variablefile = os.path.join(METADATA_DIRECTORY, 'variables.csv') 16 | variables = [] 17 | with open(variablefile, 'r') as variablefh: 18 | reader = csv.DictReader(variablefh) 19 | for row in reader: 20 | key = row['db_table'] + "_" + row['db_name'] 21 | variables.append({'key':key, 'xpath':row['xpath'], 'row':row}) 22 | #print("\t - %s" % key) 23 | self.variables = variables 24 | 25 | def find_children(self, key): 26 | results = [] 27 | for var in self.variables: 28 | if var['xpath'].startswith(key): 29 | results.append({'name': var['key'], 'xpath': var['xpath'], 'row':var['row']}) 30 | return results 31 | 32 | def find_match(self, key): 33 | for var in self.variables: 34 | if var['xpath'] == key: 35 | return var 36 | return None 37 | 38 | def find_empty_heads(self): 39 | count = 0 40 | for var in self.variables: 41 | key = var['xpath'] + "/" 42 | #print("Finding var %s" % key) 43 | children = self.find_children(key) 44 | if len(children) > 2: 45 | print("\n\nHandling xpath=%s" % var['xpath']) 46 | print("db_table %s; db_name:%s" % (var['row']['db_table'],var['row']['db_name'] )) 47 | print("Num children: %s" % (len(children))) 48 | 49 | 50 | print("select count(*) from return_%s where not \"%s\" is null;" % (var['row']['db_table'],var['row']['db_name'])) 51 | this_model = apps.get_model(app_label='return', model_name=var['row']['db_table']) 52 | if 
this_model: 53 | #print ("Got model %s name %s" % (this_model, var['row']['db_table'] )) 54 | pass 55 | else: 56 | print("model missing %s" % var['row']['db_table'] ) 57 | assert False 58 | 59 | # now see how many elements there are. 60 | fieldname = var['row']['db_name'] 61 | notnullcount = this_model.objects.filter(**{fieldname+'__isnull': False}).count() 62 | print("Count of this field is %s" % notnullcount) 63 | if notnullcount == 0: 64 | count += 1 65 | self.writer.writerow([var['xpath']]) 66 | 67 | print("Total suspected empty heads: %s" % count) 68 | 69 | 70 | def handle(self, *args, **options): 71 | outfile = open("emptyheads.csv", "w") 72 | 73 | self.writer = csv.writer(outfile) 74 | self.variables = None 75 | self.get_var_hash() 76 | self.find_empty_heads() 77 | -------------------------------------------------------------------------------- /irsdb/filing/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.conf import settings 3 | import os 4 | import re 5 | 6 | from irsx import settings as irsx_settings 7 | XML_DIR = irsx_settings.WORKING_DIRECTORY 8 | 9 | VERSION_RE=re.compile(r'returnVersion="(20\d\dv\d\.\d)"') 10 | 11 | class Filing(models.Model): 12 | 13 | # This is set from the index file. 
14 | submission_year = models.IntegerField(blank=False, null=False, default=0, help_text="Index file year") 15 | 16 | # Verbatim fields set from the csv file 17 | return_id = models.CharField(max_length=8, blank=False, null=False, default="", help_text="Return ID") 18 | filing_type = models.CharField(max_length=5, blank=False, null=False, default="", help_text="Always EFILE") 19 | ein = models.CharField(max_length=9, blank=False, null=False, default="", help_text="Employer ID number") 20 | tax_period = models.IntegerField(blank=False, null=False, default=0, help_text="Month filed, YYYYMM") 21 | sub_date = models.CharField(max_length=22, blank=False, null=False, default="", help_text="Submitted date in " 22 | "YYYY-MM-DD format. But submitted to whom?") 23 | taxpayer_name = models.CharField(max_length=100, blank=False, null=False, default="", help_text="Organization name") 24 | return_type = models.CharField(max_length=5, blank=False, null=False, default="", help_text="Return type") 25 | dln = models.CharField(max_length=14, blank=False, null=False, default="", help_text="Document Locator Number") 26 | object_id = models.CharField(max_length=18, blank=False, null=False, default="", help_text="IRS-assigned unique ID") 27 | 28 | # fields we set after processing 29 | schema_version = models.TextField(null=True, help_text="schema version as it appears, e.g. 
2015v2.1 ") 30 | tax_year = models.IntegerField(blank=True, null=True, help_text="The year of the tax period, set this from " 31 | "tax_period") 32 | 33 | # Processing notes 34 | parse_started = models.NullBooleanField(help_text="Set this true when parsing begins") 35 | parse_complete = models.NullBooleanField(null=True, help_text="Set true when data stored") 36 | process_time = models.DateTimeField(null=True, help_text="When was parsing complete?") 37 | is_error = models.NullBooleanField(help_text="Was an error of any type encountered during parsing") 38 | key_error_count = models.IntegerField(blank=True, null=True, help_text="Number of key errors found") 39 | error_details = models.TextField(null=True, help_text="Describe error condition") 40 | 41 | def get_aws_URL(self): 42 | return "https://s3.amazonaws.com/irs-form-990/%s_public.xml" % self.object_id 43 | 44 | def get_local_URL(self): 45 | return os.path.join(XML_DIR, "%s_public.xml" % self.object_id) 46 | 47 | def set_schema_version(self): 48 | """ 49 | Sets the schema version by trying to read top of file locally. 50 | Efficient b/c it doesn't parse xml, just runs regex on files second line. 51 | Doesn't set if file is missing. 
52 | """ 53 | filepath = self.get_local_URL() 54 | try: 55 | infile = open(filepath, "r") 56 | except FileNotFoundError: 57 | print("File %s is missing, quitting" % filepath) 58 | return False 59 | top = infile.read(1024) 60 | infile.close() 61 | returnline = top.split("\n")[1] 62 | result = VERSION_RE.search(returnline) 63 | if result: 64 | if result != self.schema_version: 65 | self.schema_version = result.group(1) 66 | self.save() 67 | else: 68 | print("No result in object_id: %s returnline:%s" % (self.object_id, returnline)) 69 | 70 | 71 | class Meta: 72 | managed = True 73 | indexes = [ 74 | models.Index(fields=['object_id']), 75 | ] 76 | -------------------------------------------------------------------------------- /irsdb/filing/management/commands/enter_yearly_submissions.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import requests 4 | 5 | from django.core.management.base import BaseCommand 6 | from filing.models import Filing 7 | from django.conf import settings 8 | from irsx.settings import INDEX_DIRECTORY 9 | from irsx.file_utils import stream_download 10 | 11 | BATCH_SIZE = 10000 12 | 13 | 14 | class Command(BaseCommand): 15 | help = ''' 16 | Read the yearly csv file line by line and add new lines if 17 | they don't exist. Lines are added in bulk at the end. 
18 | ''' 19 | 20 | def add_arguments(self, parser): 21 | # Positional arguments 22 | parser.add_argument('year', nargs='+', type=int) 23 | 24 | def handle(self, *args, **options): 25 | for year in options['year']: 26 | local_file_path = os.path.join(INDEX_DIRECTORY, "index_%s.csv" % year) 27 | 28 | 29 | print("Entering xml submissions from %s" % local_file_path) 30 | fh = open(local_file_path, 'r') 31 | reader = csv.reader(fh) 32 | rows_to_enter = [] 33 | 34 | # ignore header rows 35 | 36 | # python 2 idiom: headers = reader.next() <--- but this is a django 2 thing, so no python 2.X 37 | next(reader) 38 | count = 0 39 | for line in reader: 40 | try: 41 | # sometimes there's an empty extra column, ignore it 42 | # RETURN_ID,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID 43 | #(return_id, ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id) = line[0:8] 44 | 45 | ## for newer style index forms 2020 and on, perhaps 46 | (return_id, filing_type,ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id) = line[0:9] 47 | # RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID 48 | #print(return_id, ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id) 49 | 50 | ### select tax_period, parse_complete, count(*) from filing_filing where parse_started is null group by 1,2 order by 1,2; 51 | 52 | ### delete from filing_filing where parse_complete is null and tax_period like '2020%'; 53 | 54 | ### select tax_period, parse_complete, count(*) from filing_filing where parse_complete is null and tax_period like '2020%' group by 1,2 order by 1,2; 55 | except ValueError as err: 56 | print("Error with line: {line}".format(line=line)) 57 | if year == 2014: 58 | print('Did you fix the 2014 index file? 
See the README for instructions.') 59 | raise 60 | 61 | try: 62 | obj = Filing.objects.get(object_id=object_id) 63 | except Filing.DoesNotExist: 64 | new_sub = Filing( 65 | return_id=return_id, 66 | submission_year=year, 67 | ein=ein, 68 | tax_period=tax_period, 69 | sub_date=sub_date, 70 | taxpayer_name=taxpayer_name, 71 | return_type=return_type, 72 | dln=dln, 73 | object_id=object_id 74 | ) 75 | 76 | rows_to_enter.append(new_sub) 77 | count += 1 78 | 79 | if count % BATCH_SIZE == 0 and count > 0: 80 | print("Committing %s total entered=%s" % (BATCH_SIZE, count)) 81 | Filing.objects.bulk_create(rows_to_enter) 82 | print("commit complete") 83 | rows_to_enter = [] 84 | 85 | Filing.objects.bulk_create(rows_to_enter) 86 | print("Added %s new entries." % count) 87 | -------------------------------------------------------------------------------- /irsdb/metadata/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Base for import of metadata csv files 4 | class IRSxBase(models.Model): 5 | parent_sked = models.CharField(max_length=63, blank=True, null=True, help_text="Schedule name", editable=False) 6 | parent_sked_part = models.CharField(max_length=63, blank=True, null=True, help_text="db compliant name; NA for ScheduleParts") 7 | ordering = models.FloatField(null=True, blank=True, help_text="sort order of parts") 8 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) 9 | 10 | class Meta: 11 | abstract = True 12 | 13 | class Variable(IRSxBase): 14 | in_a_group = models.BooleanField(help_text="is this variable in a group", default=False) 15 | db_table = models.CharField(max_length=63, blank=True, null=True, help_text="db table", editable=False) 16 | db_name = models.CharField(max_length=63, blank=True, null=True, help_text="db name", editable=False) 17 | irs_type = models.CharField(max_length=63, blank=True, null=True, help_text="IRS Type", 
editable=False) 18 | db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False) 19 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False) 20 | description = models.TextField(help_text="IRS-supplied description, from .xsd. ") 21 | version_start = models.TextField(help_text="Start year", null=True) 22 | version_end = models.TextField(help_text="End year", null=True) 23 | is_canonical = models.NullBooleanField(help_text="", default=False) 24 | canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False) 25 | 26 | def get_absolute_url(self): 27 | return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name)) 28 | 29 | class Group(IRSxBase): 30 | db_name = models.CharField(max_length=63, blank=True, null=True, help_text="DB name", editable=False) 31 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False) 32 | description = models.TextField(help_text="IRS-supplied description, from .xsd. ") 33 | headless = models.NullBooleanField(help_text="", default=False) 34 | version_start = models.TextField(help_text="Start year", null=True) 35 | version_end = models.TextField(help_text="End year", null=True) 36 | 37 | def get_absolute_url(self): 38 | return ("/metadata/groups/%s.html" % self.db_name) 39 | 40 | class SchedulePart(IRSxBase): 41 | part_name = models.CharField(max_length=255, blank=True, null=True, help_text="Part Name.", editable=False) 42 | xml_root = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath? 
43 | is_shell = models.NullBooleanField(help_text="", default=False) 44 | 45 | def get_absolute_url(self): 46 | return ("/metadata/parts/%s.html" % self.parent_sked_part) 47 | 48 | 49 | # For historic reference to precise line_numbers, descriptions 50 | 51 | class LineNumber(models.Model): 52 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath? 53 | version_start = models.TextField(help_text="Start year", null=True) 54 | version_end = models.TextField(help_text="End year", null=True) 55 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False) 56 | 57 | class Description(models.Model): 58 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath? 59 | version_start = models.TextField(help_text="Start year", null=True) 60 | version_end = models.TextField(help_text="End year", null=True) 61 | description = models.TextField(help_text="description") 62 | -------------------------------------------------------------------------------- /irsdb/schemas/model_accumulator.py: -------------------------------------------------------------------------------- 1 | from django.apps import apps 2 | from django.forms import model_to_dict 3 | 4 | # Setting too big will create memory problems 5 | BATCH_SIZE = 100 6 | VERBOSE = False 7 | 8 | # TODO: allow appname to be passed as an argument. 
9 | APPNAME = 'return' 10 | 11 | listtype = type([]) 12 | 13 | class Accumulator(object): 14 | 15 | def __init__(self): 16 | self.model_dict = {} 17 | self.model_cache = {} 18 | # Expected: 19 | # self.model_dict{model_name: [modeldictionary1, modeldictionary2,]...} 20 | 21 | def _clean_restricted(self, dict): 22 | """ RESTRICTED is only sked b, SSN's appear in a variety of places 23 | we could do a better job of restricting this 24 | """ 25 | for key in dict.keys(): 26 | if type(dict[key])==listtype: 27 | print("\n\n***list found %s" % (key)) 28 | 29 | 30 | # IRS will replace anything they think is a SSN with "XXX-XX-XXXX" 31 | # this seems to include 9 digit numbers. 32 | # The result is that the irs can lengthen fields (breaking max_length) 33 | # by doing this, so use a formulation that's shorter than this. 34 | if dict[key]: 35 | dict[key]=dict[key].replace('XXX-XX-XXXX', '-SSN-') 36 | 37 | if dict[key]=='RESTRICTED': 38 | # These are numeric fields, don't try to save 'RESTRICTED' 39 | dict[key]=0 40 | 41 | 42 | def _get_model(self, model_name, appname='return'): 43 | # cache locally so django doesn't try to hit the db every time 44 | try: 45 | return self.model_cache[appname + model_name] 46 | except KeyError: 47 | self.model_cache[model_name] = apps.get_model(appname, model_name) 48 | return self.model_cache[model_name] 49 | 50 | def commit_by_key(self, model_name): 51 | if self.model_dict[model_name]: 52 | this_model = self._get_model(model_name) 53 | if (VERBOSE): 54 | print("Committing %s objects for key %s" % ( 55 | len(self.model_dict[model_name]), 56 | model_name 57 | ) 58 | ) 59 | this_model.objects.bulk_create(self.model_dict[model_name]) 60 | 61 | # set array to empty 62 | self.model_dict[model_name] = [] 63 | 64 | def add_model(self, model_name, model_dict): 65 | # An artifact upstream is creating empty rows, with no name and only an ein and object_id 66 | # This is probably related to the 'empty head' rows in variables.csv, which will be excised 
by loading this db and analyzing 67 | if not model_name: 68 | print("###Model name is missing in object_id %s\ndict=%s!!!" % (model_dict['object_id'], model_dict)) 69 | return 70 | this_model = self._get_model(model_name) 71 | self._clean_restricted(model_dict) 72 | model_instance = this_model(**model_dict) 73 | try: 74 | self.model_dict[model_name].append(model_instance) 75 | 76 | except KeyError: 77 | self.model_dict[model_name]= [model_instance] 78 | 79 | if len(self.model_dict[model_name]) >= BATCH_SIZE: 80 | self.commit_by_key(model_name) 81 | 82 | def commit_all(self): 83 | # commit everything 84 | if (VERBOSE): 85 | print("Running commit all! ") 86 | print(self.object_report()) 87 | for thiskey in self.model_dict.keys(): 88 | if (VERBOSE): 89 | print("Commit key %s" % thiskey) 90 | self.commit_by_key(thiskey) 91 | 92 | def count(self, model_name): 93 | return len(self.model_dict[model_name]) 94 | 95 | def object_report(self): 96 | total = 0 97 | for i in self.model_dict.keys(): 98 | thislen = self.count(i) 99 | total += thislen 100 | if thislen > 0: 101 | print("\t%s:%s" % (i, thislen)) 102 | print("Total %s objects" % total) 103 | -------------------------------------------------------------------------------- /irsdb/irsdb/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for irsdb project. 3 | 4 | Generated by 'django-admin startproject' using Django 2.0.1. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/2.0/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): this key is checked into source control -- presumably a
# production deployment overrides it via local_settings (imported below);
# confirm before deploying.
SECRET_KEY = '-9eh@&_nb_nvklo6fhck5&bts#+*@hubea7+!65kwr(r4%9$8='

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition
# (metadata/filing/return/schemas are the project's own apps)

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'metadata',
    'filing',
    'return',
    'schemas',

]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'irsdb.urls'

# Template engine deliberately disabled here; uncomment to enable the
# standard django template stack.
# TEMPLATES = [
#     {
#         'BACKEND': 'django.template.backends.django.DjangoTemplates',
#         'DIRS': [],
#         'APP_DIRS': False,
#         'OPTIONS': {
#             'context_processors': [
#                 'django.template.context_processors.debug',
#                 'django.template.context_processors.request',
#                 'django.contrib.auth.context_processors.auth',
#                 'django.contrib.messages.context_processors.messages',
#             ],
#         },
#     },
# ]

WSGI_APPLICATION = 'irsdb.wsgi.application'


# Database
# https://docs.djangoproject.com/en/2.0/ref/settings/#databases
# NOTE(review): no active DATABASES block here -- presumably supplied by
# local_settings (imported at the bottom of this file); verify.

# DATABASES = {
#     'default': {
#         'ENGINE': 'django.db.backends.sqlite3',
#         'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
#     }
# }


# Password validation
# https://docs.djangoproject.com/en/2.0/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/2.0/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.0/howto/static-files/

STATIC_URL = '/static/'

STATICFILES_DIRS = [
    os.path.join(BASE_DIR, "static")
]

# will break without irsx (the irsx package must be installed)
from irsx.settings import METADATA_DIRECTORY, KNOWN_SCHEDULES

# where generate_schemas_from_metadata writes its output
GENERATED_MODELS_DIR = os.path.join(BASE_DIR, "generated_schemas")

## suppress verbose complaints by uncommenting
#import warnings
#warnings.simplefilter(action='ignore', category=RuntimeWarning)



# local_settings overrides everything above (secret key, DB config, ...).
try:
    from .local_settings import *
except ImportError as e:
    print("Error importing local_settings.py\n%s" % e)
from django.conf import settings

#METADATA_DIRECTORY = settings.METADATA_DIRECTORY
METADATA_DIRECTORY = settings.GENERATED_MODELS_DIR
REPORT_COUNT = 100
CANONICAL_VERSION = '2016v3.0'


class Command(BaseCommand):
    help = """
    Erase and reload the metadata tables, one at a time.
    """

    def read_blacklist(self):
        """Read blacklisted xpaths into self.blacklist (xpath -> True)."""
        infilepath = os.path.join(METADATA_DIRECTORY, "emptyhead_blacklist.txt")
        # context manager so the file handle is closed even on error
        with open(infilepath, 'r') as infile:
            for row in infile:
                xpath = row.replace("\n", "")
                print("blacklisting xpath '%s' " % xpath)
                self.blacklist[xpath] = True

    def reload_variables(self, *args, **options):
        """Delete and reload Variable rows from variables.csv, skipping
        blacklisted xpaths."""
        print("Deleting variables.")
        Variable.objects.all().delete()
        infilepath = os.path.join(METADATA_DIRECTORY, "variables.csv")
        i = -1  # so the final count doesn't NameError on an empty file
        with open(infilepath, 'r') as infile:
            reader = csv.DictReader(infile)
            for i, row in enumerate(reader):
                try:
                    self.blacklist[row['xpath']]
                    # it's blacklisted, so ignore
                    print("ignoring blacklisted xpath %s" % row['xpath'])
                except KeyError:
                    # not blacklisted, so create it
                    Variable.objects.create(**row)
        print("Total Variables %s" % i)

    def reload_groups(self, *args, **options):
        """Delete and reload Group rows from groups.csv."""
        print("Deleting Groups.")
        Group.objects.all().delete()
        infilepath = os.path.join(METADATA_DIRECTORY, "groups.csv")
        i = -1
        with open(infilepath, 'r') as infile:
            reader = csv.DictReader(infile)
            for i, row in enumerate(reader):
                try:
                    # csv has no NULL; empty string means "not set"
                    if row['headless'] == '':
                        row['headless'] = None
                except KeyError:
                    pass
                if i % REPORT_COUNT == 0:
                    print("Created %s rows" % i)
                Group.objects.create(**row)
        print("Total Groups %s" % i)

    def reload_schedule_parts(self, *args, **options):
        """Delete and reload SchedulePart rows from schedule_parts.csv."""
        print("Deleting ScheduleParts.")
        SchedulePart.objects.all().delete()
        infilepath = os.path.join(METADATA_DIRECTORY, "schedule_parts.csv")
        i = -1
        with open(infilepath, 'r') as infile:
            reader = csv.DictReader(infile)
            for i, row in enumerate(reader):
                try:
                    # csv has no NULL; empty string means "not set"
                    if row['is_shell'] == '':
                        row['is_shell'] = None
                except KeyError:
                    pass
                if i % REPORT_COUNT == 0:
                    print("Created %s rows" % i)
                SchedulePart.objects.create(**row)
        print("Total Schedule Parts %s" % i)

    def reload_line_numbers(self, *args, **options):
        """Delete and reload LineNumber rows from line_numbers.csv,
        skipping blacklisted xpaths."""
        print("Deleting LineNumbers.")
        LineNumber.objects.all().delete()
        infilepath = os.path.join(METADATA_DIRECTORY, "line_numbers.csv")
        i = -1
        with open(infilepath, 'r') as infile:
            reader = csv.DictReader(infile)
            for i, row in enumerate(reader):
                if i % REPORT_COUNT == 0:
                    print("Created %s rows" % i)
                try:
                    self.blacklist[row['xpath']]
                    # it's blacklisted, so ignore
                    print("ignoring blacklisted xpath line number... %s" % row['xpath'])
                except KeyError:
                    # not blacklisted, so create it
                    LineNumber.objects.create(**row)
        print("Total LineNumber created %s" % i)

    def reload_descriptions(self, *args, **options):
        """Delete and reload Description rows from descriptions.csv,
        skipping blacklisted xpaths."""
        print("Deleting Descriptions.")
        Description.objects.all().delete()
        infilepath = os.path.join(METADATA_DIRECTORY, "descriptions.csv")
        i = -1
        with open(infilepath, 'r') as infile:
            reader = csv.DictReader(infile)
            for i, row in enumerate(reader):
                if i % REPORT_COUNT == 0:
                    print("Created %s rows" % i)
                try:
                    self.blacklist[row['xpath']]
                    # it's blacklisted, so ignore
                    print("ignoring blacklisted xpath description... %s" % row['xpath'])
                except KeyError:
                    # not blacklisted, so create it
                    Description.objects.create(**row)
        print("Total Description created %s" % i)

    def handle(self, *args, **options):
        """Management-command entry point: rebuild every metadata table."""
        print("Running metadata load on variables.")
        self.blacklist = {}

        self.read_blacklist()
        self.reload_variables()
        self.reload_groups()
        self.reload_schedule_parts()
        self.reload_line_numbers()
        self.reload_descriptions()
import csv
import os
import requests

from datetime import datetime

from django.core.management.base import BaseCommand
from django.conf import settings

from filing.models import Filing
from schemas.model_accumulator import Accumulator
from irsx.settings import INDEX_DIRECTORY
from irsx.file_utils import stream_download
from irsx.xmlrunner import XMLRunner
from irsx.filing import FileMissingException


# this is how many filings we process per db round trip; there's a separate
# batch size in the model accumulator for how many objects are bulk-inserted
BATCH_SIZE = 1000


class Command(BaseCommand):
    help = '''
    Enter the filings, one by one.
    Loading is done in bulk, though status on the filings is updated one at a time.

    '''

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one parsed schedule: non-repeating parts first,
        then each instance of each repeating group. """
        for partname, partdata in sked['schedule_parts'].items():
            self.accumulator.add_model(partname, partdata)

        for groupname, groupinstances in sked['groups'].items():
            for groupdata in groupinstances:
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        """Parse one Filing's xml and queue its parts/groups for bulk load.

        Also records the schema version and any key errors (xpaths missing
        from our spec) back onto the Filing row.
        """
        object_id = filing.object_id

        parsed_filing = self.xml_runner.run_filing(object_id)

        if not parsed_filing:
            # bug fix: this message used to reference an undefined
            # variable (metadata_row) and raised a NameError
            print("Skipping filing %s (filings with pre-2013 schemas are skipped)" % filing)
            return None

        result = parsed_filing.get_result()
        keyerrors = parsed_filing.get_keyerrors()
        schema_version = parsed_filing.get_version()

        ## This could be disabled if we don't care about the schema version
        ## This is one save per loaded row...
        if filing.schema_version != schema_version:
            filing.schema_version = schema_version
            filing.save()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec, note it
            # bug fix: was a broken format string printing a literal "%s"
            print("keyerror: %s in filing %s" % (keyerrors, object_id))
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = True
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def handle(self, *args, **options):
        """Entry point: load all not-yet-parsed filings for one year,
        in batches of 100, marking progress on the Filing rows."""
        year = int(options['year'][0])
        if not 2014 <= year <= 2025:
            raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2025" % year)

        print("Running filings during year %s" % year)
        self.setup()

        process_count = 0
        missing_filings = 0
        missed_file_list = []

        while True:
            # parse_complete is set at the end of each batch, so this query
            # steadily drains; an empty result means we're done
            filings = Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
            if not filings:
                print("Done")
                break

            object_id_list = [f.object_id for f in filings]

            # record that processing has begun
            Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)

            for filing in filings:
                try:
                    self.run_filing(filing)
                except FileMissingException:
                    print("File missing %s, skipping" % filing.object_id)
                    missing_filings += 1
                    missed_file_list.append(filing.object_id)
                process_count += 1
                if process_count % 1000 == 0:
                    print("Handled %s filings" % process_count)

            # commit anything that's left, then record that the batch is complete
            self.accumulator.commit_all()
            Filing.objects.filter(object_id__in=object_id_list).update(
                process_time=datetime.now(), parse_complete=True)

        print("Processed a total of %s filings" % process_count)
        print("Total missing files: %s" % missing_filings)
        print("Missing %s" % missed_file_list)
# Mapping from IRS xml schema type name to a field spec. 'type' selects the
# field family; 'length' / 'totalDigits' / 'fractionDigits' size it.
var_types = {
    'USAmountType': {'type': 'Integer', 'length': 15},
    'BooleanType': {'type': 'Char', 'length': 5},  # http://www.datypic.com/sc/xsd/t-xsd_boolean.html, legal values = [0,1,'true', 'false']
    'USAmountNNType': {'type': 'Integer', 'length': 15},
    'CheckboxType': {'type': 'Char', 'length': 1},  # legal values = ['X']
    'StreetAddressType': {'type': 'Char', 'length': 35},
    'LineExplanationType': {'type': 'Char', 'length': 100},
    'IntegerNNType': {'type': 'Integer', 'length': 15},  # max value is unclear, see xsd:Integer vs xsd:int
    'BusinessNameLine2Type': {'type': 'Char', 'length': 75},
    'BusinessNameLine1Type': {'type': 'Char', 'length': 75},
    'RatioType': {'type': 'Decimal', 'totalDigits': 6, 'fractionDigits': 5},
    'StateType': {'type': 'Char', 'length': 2},  # need list key to translate back ?
    'CountryType': {'type': 'Char', 'length': 2},
    'ShortExplanationType': {'type': 'Text', 'length': 1000},
    'CityType': {'type': 'Char', 'length': 22},
    'ZIPCodeType': {'type': 'Char', 'length': 15},  # "ZIP Code - 5 digits plus optional 4 or 7 digits"
    'PersonNameType': {'type': 'Char', 'length': 35},
    'ExplanationType': {'type': 'Text', 'length': 9000},
    'YearType': {'type': 'Integer', 'length': 4},
    'EINType': {'type': 'Char', 'length': 9},
    'DateType': {'type': 'Char', 'length': 31},  # unclear http://www.datypic.com/sc/xsd/s-datatypes.xsd.html
    'ShortDescriptionType': {'type': 'Char', 'length': 20},
    'CountType': {'type': 'Integer', 'length': 7},  # max length is 6
    'Count2Type': {'type': 'Integer', 'length': 7},  # max length is 6
    'PhoneNumberType': {'type': 'Char', 'length': 10},
    'IRS990PFPartVDistriRatioType': {'type': 'Decimal', 'totalDigits': 12, 'fractionDigits': 6},  # was 9, manual fix
    'LargeRatioType': {'type': 'Decimal', 'totalDigits': 22, 'fractionDigits': 12},
    'DecimalNNType': {'type': 'Decimal', 'totalDigits': 22, 'fractionDigits': 2},  # dunno upper bound
    'EFINType': {'type': 'Char', 'length': 6},  # Electronic Filing Identification No. - 6 digits
    'PINType': {'type': 'Char', 'length': 5},  # Practitioner PIN, Self-Select PIN, Third Party Designee PIN
    'EmailAddressType': {'type': 'Char', 'length': 75},
    'SoftwareVersionType': {'type': 'Char', 'length': 20},
    'TimeType': {'type': 'Char', 'length': 15},  # should be <= 9 chars: [0-9]{2}:[0-9]{2}:[0-9]{2}
    'CUSIPNumberType': {'type': 'Char', 'length': 9},
    'SSNType': {'type': 'Char', 'length': 12},  # should be 9 but needs to fit "XXX-XX-XXXX" which has 11
    'DeviceIdType': {'type': 'Char', 'length': 40},
    'BusinessNameControlType': {'type': 'Char', 'length': 7},  # max is 4: ([A-Z0-9\-]|&){1,4}
    'PersonTitleType': {'type': 'Char', 'length': 35},
    'OriginatorType': {'type': 'Char', 'length': 15},
    'TimestampType': {'type': 'Char', 'length': 63},  # not sure
    'ISPType': {'type': 'Char', 'length': 6},  # Intermediate Service Provider No. - 6 uppercase alphanumerics
    'PTINType': {'type': 'Char', 'length': 9},  # Preparer Personal Identification No. - P followed by 8 digits
    'USAmountNegType': {'type': 'Integer', 'length': 15},
    'IPv6Type': {'type': 'Char', 'length': 31},
    'SoftwareIdType': {'type': 'Char', 'length': 8},  # The Software ID - 8 digits
    'IPv4Type': {'type': 'Char', 'length': 31},
    'InCareOfNameType': {'type': 'Char', 'length': 35},
    'TimezoneType': {'type': 'Char', 'length': 31},
}


def get_django_type(vartype):
    """Return a django model field declaration string for an IRS type name.

    Unknown names fall back to a nullable TextField.
    """
    try:
        spec = var_types[vartype]
    except KeyError:
        return "TextField(null=True, blank=True)"

    kind = spec['type']

    if kind == 'Integer':
        # up to 9 digits fits 32 bits (-2147483648..2147483647);
        # longer needs 64 bits (-9223372036854775808..9223372036854775807)
        if spec['length'] < 10:
            return "IntegerField(null=True, blank=True)"
        return "BigIntegerField(null=True, blank=True)"

    if kind == 'Decimal':
        return "DecimalField(null=True, blank=True, max_digits=%s, decimal_places=%s)" % (spec['totalDigits'], spec['fractionDigits'])

    if kind == 'Char':
        # very long char fields become text fields instead
        if spec['length'] <= MAX_CHAR_FIELD_SIZE:
            return "CharField(null=True, blank=True, max_length=%s)" % spec['length']
        return "TextField(null=True, blank=True)"

    if kind == 'Text':
        return "TextField(null=True, blank=True)"

    print("** No match for %s " % spec)
    return "TextField(null=True, blank=True)"


def get_sqlalchemy_type(vartype):
    """Return a sqlalchemy column type string for an IRS type name.

    This is really rough, not tested, may change. Unknown names fall back
    to Text.
    """
    try:
        spec = var_types[vartype]
    except KeyError:
        return "Text"

    kind = spec['type']

    if kind == 'Integer':
        # 64 bit should be forced to BigInteger
        if spec['length'] < 10:
            return "Integer"
        return "BigInteger"

    if kind == 'Decimal':
        return "Numeric(precision=%s)" % (spec['totalDigits'])

    if kind == 'Char':
        if spec['length'] <= MAX_CHAR_FIELD_SIZE:
            return "String(length=%s)" % spec['length']
        return "Text"

    if kind == 'Text':
        return "Text"

    print("** No match for %s " % spec)
    return "Text"


if __name__ == "__main__":
    # quick manual smoke test: show the resolution for every known type
    for key in var_types:
        print("key %s resolve to '%s' and '%s'" % (key, get_django_type(key), get_sqlalchemy_type(key)))
this_capture_sked['parts'][part_key] 39 | 40 | row_data = {} 41 | row_data['form'] = sked 42 | row_data['source'] = part_key 43 | row_data['taxpayer_name'] = taxpayer_name 44 | 45 | 46 | for capture_key in capture_dict.keys(): 47 | if capture_key == 'stream_key': 48 | continue 49 | try: 50 | val = part[capture_key] 51 | csv_header = capture_dict[capture_key]['header'] 52 | row_data[csv_header] = val 53 | 54 | except KeyError: 55 | try: 56 | default = capture_dict[capture_key]['default'] 57 | csv_header = capture_dict[capture_key]['header'] 58 | row_data[csv_header]=default 59 | except KeyError: 60 | #print("Key Error %s" % capture_key) 61 | pass 62 | 63 | ## Composite keys: Not implemented here. 64 | 65 | #print("row data is %s" % row_data) 66 | ## We've gone through who whole part -- write it to file 67 | this_stream['writer'].writerow(row_data) 68 | 69 | 70 | 71 | def run_groups(self, this_capture_sked, parsed_sked, sked, taxpayer_name=""): 72 | for group_key in this_capture_sked['groups'].keys(): 73 | stream_key = this_capture_sked['groups'][group_key]['stream_key'] 74 | this_stream = self.output_streams[stream_key] 75 | groups = None 76 | try: 77 | groups = parsed_sked['groups'][group_key] 78 | except KeyError: 79 | #print("No groups found for %s\n" % group_key) 80 | continue 81 | 82 | for group in groups: 83 | capture_dict = this_capture_sked['groups'][group_key] 84 | row_data = {} 85 | row_data['form'] = sked 86 | row_data['source'] = group_key 87 | row_data['taxpayer_name'] = taxpayer_name 88 | 89 | for capture_key in capture_dict.keys(): 90 | if capture_key == 'stream_key': 91 | continue 92 | try: 93 | val = group[capture_key] 94 | csv_header = capture_dict[capture_key]['header'] 95 | row_data[csv_header] = val 96 | 97 | except KeyError: 98 | try: 99 | default = capture_dict[capture_key]['default'] 100 | csv_header = capture_dict[capture_key]['header'] 101 | row_data[csv_header]=default 102 | except KeyError: 103 | pass 104 | 105 | ## now look for 
"composite keys" 106 | composite_groups = None 107 | try: 108 | composite_groups = capture_dict['composite'] 109 | except KeyError: 110 | pass 111 | 112 | # composite groups are summed up from existing vars, and need a default 113 | if composite_groups: 114 | for composite_group_key in composite_groups.keys(): 115 | total = 0 116 | for cg_part in composite_groups[composite_group_key].keys(): 117 | try: 118 | val = group[cg_part] 119 | total += int(val) 120 | except KeyError: 121 | total += composite_groups[composite_group_key][cg_part]['default'] 122 | row_data[composite_group_key] = total 123 | 124 | ## We've gone through who whole group -- write it to file 125 | this_stream['writer'].writerow(row_data) 126 | 127 | def run_filing(self, filing, taxpayer_name=""): 128 | 129 | parsed_filing = self.xml_runner.run_filing(filing) 130 | schedule_list = parsed_filing.list_schedules() 131 | 132 | if ( int(parsed_filing.get_version()[:4]) < 2013 ): 133 | print("Skipping pre-2013 schemas") 134 | return None 135 | 136 | for sked in self.data_capture_dict.keys(): 137 | if sked in schedule_list: 138 | #print ("Running sked %s" % sked) 139 | parsed_skeds = parsed_filing.get_parsed_sked(sked) 140 | if parsed_skeds: 141 | parsed_sked = parsed_skeds[0] 142 | else: 143 | continue 144 | 145 | this_capture_sked = self.data_capture_dict[sked] 146 | 147 | 148 | ### Repeating Groups 149 | skip_groups = False 150 | try: 151 | this_capture_sked['groups'] 152 | except KeyError: 153 | skip_groups = True 154 | if not skip_groups: 155 | self.run_groups(this_capture_sked, parsed_sked, sked, taxpayer_name=taxpayer_name) 156 | 157 | 158 | ### Nonrepeating schedule parts 159 | skip_parts = False 160 | try: 161 | this_capture_sked['parts'] 162 | except KeyError: 163 | skip_parts = True 164 | if not skip_parts: 165 | self.run_parts(this_capture_sked, parsed_sked, sked, taxpayer_name=taxpayer_name) 166 | else: 167 | #print("missing sked %s" % sked) 168 | pass 169 | 170 | 171 | 
-------------------------------------------------------------------------------- /grants.sh: -------------------------------------------------------------------------------- 1 | -- Assumes that directors was already run. 2 | 3 | -- Schedule I 4 | 5 | -- The schedule I variables are defined in the [irsx documentation](http://www.irsx.info/metadata/groups/SkdIRcpntTbl.html). 6 | 7 | -- Here's a query to a temp table 8 | 9 | 10 | DROP TABLE IF EXISTS grants; 11 | 12 | SELECT 13 | return_SkdIRcpntTbl.object_id as object_id, 14 | address_table."RtrnHdr_TxPrdEndDt", 15 | address_table."RtrnHdr_TxYr", 16 | address_table."BsnssOffcr_SgntrDt", 17 | address_table."BsnssNm_BsnssNmLn1Txt" as "Donor_BsnssNmLn1", 18 | address_table."BsnssNm_BsnssNmLn2Txt" as "Donor_BsnssNmLn2", 19 | address_table."BsnssOffcr_PrsnNm" as "Donor_BsnssOffcr_PrsnNm", 20 | address_table."BsnssOffcr_PrsnTtlTxt" as "Donor_BsnssOffcr_PrsnTtlTxt", 21 | address_table."BsnssOffcr_PhnNm" as "Donor_BsnssOffcr_PhnNm" , 22 | address_table."BsnssOffcr_EmlAddrssTxt" as "Donor_BsnssOffcr_EmlAddrssTxt" , 23 | address_table."USAddrss_AddrssLn1Txt" as "Donor_AddrssLn1Txt", 24 | address_table."USAddrss_AddrssLn2Txt" as "Donor_AddrssLn2Txt", 25 | address_table."USAddrss_CtyNm" as "Donor_CtyNm", 26 | address_table."USAddrss_SttAbbrvtnCd" as "Donor_SttAbbrvtnCd", 27 | address_table."USAddrss_ZIPCd" as "Donor_ZIPCd", 28 | address_table."FrgnAddrss_AddrssLn1Txt" as "Donor_FrgnAddrss_AddrssLn1Txt", 29 | address_table."FrgnAddrss_AddrssLn2Txt" as "Donor_FrgnAddrss_AddrssLn2Txt", 30 | address_table."FrgnAddrss_CtyNm" as "Donor_FrgnAddrss_CtyNm", 31 | address_table."FrgnAddrss_PrvncOrSttNm" as "Donor_PrvncOrSttNm", 32 | address_table."FrgnAddrss_CntryCd" as "Donor_CntryCd", 33 | return_SkdIRcpntTbl.ein as "Donor_EIN", 34 | '' as "RcpntPrsnNm", 35 | return_SkdIRcpntTbl."RcpntTbl_RcpntEIN" as "Rcpnt_EIN", 36 | return_SkdIRcpntTbl."RcpntBsnssNm_BsnssNmLn1Txt" as "Rcpnt_BsnssNmLn1", 37 | 
return_SkdIRcpntTbl."RcpntBsnssNm_BsnssNmLn2Txt" as "Rcpnt_BsnssNmLn2", 38 | trim(concat(return_SkdIRcpntTbl."USAddrss_AddrssLn1Txt", ' ', return_SkdIRcpntTbl."FrgnAddrss_AddrssLn1Txt")) as "Rcpnt_AddrssLn1", 39 | trim(concat(return_SkdIRcpntTbl."USAddrss_AddrssLn2Txt", ' ', return_SkdIRcpntTbl."FrgnAddrss_AddrssLn2Txt")) as "Rcpnt_AddrssLn2", 40 | trim(concat(return_SkdIRcpntTbl."USAddrss_CtyNm", ' ', return_SkdIRcpntTbl."FrgnAddrss_CtyNm")) as "Rcpnt_CtyNm", 41 | trim(concat(return_SkdIRcpntTbl."USAddrss_SttAbbrvtnCd", ' ', return_SkdIRcpntTbl."FrgnAddrss_PrvncOrSttNm")) as "Rcpnt_SttAbbrvtnCd", 42 | return_SkdIRcpntTbl."RcpntTbl_CshGrntAmt" as "Rcpnt_Amt", 43 | return_SkdIRcpntTbl."RcpntTbl_PrpsOfGrntTxt" as "Rcpnt_PrpsTxt", 44 | trim(concat(return_SkdIRcpntTbl."USAddrss_ZIPCd", ' ', return_SkdIRcpntTbl."FrgnAddrss_FrgnPstlCd")) as "Rcpnt_ZIPCd", 45 | '' as "Rcpnt_Rltnshp", 46 | return_SkdIRcpntTbl."RcpntTbl_IRCSctnDsc" as "Rcpnt_FndtnStts" 47 | INTO TEMPORARY TABLE grants 48 | FROM return_SkdIRcpntTbl 49 | LEFT JOIN address_table 50 | ON return_SkdIRcpntTbl.object_id = address_table.object_id 51 | AND return_SkdIRcpntTbl.ein = address_table.ein; 52 | 53 | 54 | 55 | 56 | -- Add org type data 57 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990ScheduleI' as form, grants.* into temporary table grants_types from grants left join org_types on grants.object_id = org_types.object_id and grants."Donor_EIN" = org_types.ein; 58 | 59 | -- Then copy to local with \copy: 60 | 61 | \copy grants_types to '/data/file_exports/skedigrants.csv' with csv header; 62 | 63 | 64 | 65 | 66 | -------- 67 | -- Form PF Part XV "Grant or Contribution Paid During Year" 68 | -- 69 | -- See the IRSX documentation for form PPF Part XV [Grant or Contribution Paid During Year](http://www.irsx.info/metadata/groups/PFGrntOrCntrbtnPdDrYr.html) 70 | -- 71 | -- Note that there's also a different section for grants of contributions approved for 
future years that we aren't using to avoid double-counting; see [the form instructions](https://www.irs.gov/instructions/i990pf#idm140486306377296) for (not much) more info. 72 | -------- 73 | 74 | 75 | DROP TABLE IF EXISTS pfgrants; 76 | 77 | SELECT 78 | return_PFGrntOrCntrbtnPdDrYr.object_id as object_id, 79 | address_table."RtrnHdr_TxPrdEndDt", 80 | address_table."RtrnHdr_TxYr", 81 | address_table."BsnssOffcr_SgntrDt", 82 | address_table."BsnssNm_BsnssNmLn1Txt" as "Donor_BsnssNmLn1", 83 | address_table."BsnssNm_BsnssNmLn2Txt" as "Donor_BsnssNmLn2", 84 | address_table."BsnssOffcr_PrsnNm" as "Donor_BsnssOffcr_PrsnNm", 85 | address_table."BsnssOffcr_PrsnTtlTxt" as "Donor_BsnssOffcr_PrsnTtlTxt", 86 | address_table."BsnssOffcr_PhnNm" as "Donor_BsnssOffcr_PhnNm" , 87 | address_table."BsnssOffcr_EmlAddrssTxt" as "Donor_BsnssOffcr_EmlAddrssTxt" , 88 | address_table."USAddrss_AddrssLn1Txt" as "Donor_AddrssLn1Txt", 89 | address_table."USAddrss_AddrssLn2Txt" as "Donor_AddrssLn2Txt", 90 | address_table."USAddrss_CtyNm" as "Donor_CtyNm", 91 | address_table."USAddrss_SttAbbrvtnCd" as "Donor_SttAbbrvtnCd", 92 | address_table."USAddrss_ZIPCd" as "Donor_ZIPCd", 93 | address_table."FrgnAddrss_AddrssLn1Txt" as "Donor_FrgnAddrss_AddrssLn1Txt", 94 | address_table."FrgnAddrss_AddrssLn2Txt" as "Donor_FrgnAddrss_AddrssLn2Txt", 95 | address_table."FrgnAddrss_CtyNm" as "Donor_FrgnAddrss_CtyNm", 96 | address_table."FrgnAddrss_PrvncOrSttNm" as "Donor_PrvncOrSttNm", 97 | address_table."FrgnAddrss_CntryCd" as "Donor_CntryCd", 98 | return_PFGrntOrCntrbtnPdDrYr.ein as "Donor_EIN", 99 | '' as "Rcpnt_EIN", 100 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntPrsnNm" as "RcpntPrsnNm", 101 | return_PFGrntOrCntrbtnPdDrYr."RcpntBsnssNm_BsnssNmLn1Txt" as "Rcpnt_BsnssNmLn1", 102 | return_PFGrntOrCntrbtnPdDrYr."RcpntBsnssNm_BsnssNmLn2Txt" as "Rcpnt_BsnssNmLn2", 103 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_AddrssLn1Txt", ' ', 
return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_AddrssLn1Txt")) as "Rcpnt_AddrssLn1", 104 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_AddrssLn2Txt", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_AddrssLn2Txt")) as "Rcpnt_AddrssLn2", 105 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_CtyNm", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_CtyNm")) as "Rcpnt_CtyNm", 106 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_SttAbbrvtnCd", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_PrvncOrSttNm")) as "Rcpnt_SttAbbrvtnCd", 107 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_Amt" as "Rcpnt_Amt", 108 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_GrntOrCntrbtnPrpsTxt" as "Rcpnt_PrpsTxt", 109 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_ZIPCd", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_FrgnPstlCd")) as "Rcpnt_ZIPCd", 110 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntRltnshpTxt" as "Rcpnt_Rltnshp", 111 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntFndtnSttsTxt" as "Rcpnt_FndtnStts" 112 | INTO TEMPORARY TABLE pfgrants 113 | FROM return_PFGrntOrCntrbtnPdDrYr 114 | LEFT JOIN address_table ON return_PFGrntOrCntrbtnPdDrYr.object_id = address_table.object_id 115 | AND return_PFGrntOrCntrbtnPdDrYr.ein = address_table.ein; 116 | 117 | -- Add org type data 118 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990PF' as form, pfgrants.* into temporary table pfgrants_types from pfgrants left join org_types on pfgrants.object_id = org_types.object_id and pfgrants."Donor_EIN" = org_types.ein; 119 | 120 | -- Copy to local 121 | 122 | \copy pfgrants_types to '/data/file_exports/pfgrants.csv' with csv header; 123 | -------------------------------------------------------------------------------- /irsdb/return/management/commands/load_filings_multithreaded.py: -------------------------------------------------------------------------------- 1 | 
import csv 2 | import os 3 | import requests 4 | 5 | from datetime import datetime 6 | 7 | from django.core.management.base import BaseCommand 8 | from django.conf import settings 9 | 10 | from filing.models import Filing 11 | from schemas.model_accumulator import Accumulator 12 | from irsx.settings import INDEX_DIRECTORY 13 | from irsx.file_utils import stream_download 14 | from irsx.xmlrunner import XMLRunner 15 | 16 | from queue import Queue 17 | from threading import Thread 18 | from django.db import connection 19 | 20 | # this is how many we process; there's a separate batch size 21 | # in model accumulator for how many are processed 22 | BATCH_SIZE = 1000 23 | 24 | class DownloadWorker(Thread): 25 | def add_arguments(self, parser): 26 | # Positional arguments 27 | parser.add_argument('year', nargs=1, type=int) 28 | 29 | def setup(self): 30 | # get an XMLRunner -- this is what actually does the parsing 31 | self.xml_runner = XMLRunner() 32 | self.accumulator = Accumulator() 33 | 34 | def process_sked(self, sked): 35 | """ Enter just one schedule """ 36 | #print("Processing schedule %s" % sked['schedule_name']) 37 | for part in sked['schedule_parts'].keys(): 38 | partname = part 39 | partdata = sked['schedule_parts'][part] 40 | #print("part %s %s" % (partname, partdata)) 41 | 42 | self.accumulator.add_model(partname, partdata) 43 | 44 | for groupname in sked['groups'].keys(): 45 | for groupdata in sked['groups'][groupname]: 46 | #print("group %s %s" % (groupname, groupdata) ) 47 | self.accumulator.add_model(groupname, groupdata) 48 | 49 | def run_filing(self, filing): 50 | # print (filing) 51 | 52 | object_id = filing.object_id 53 | 54 | parsed_filing = self.xml_runner.run_filing(object_id) 55 | if not parsed_filing: 56 | print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % ( 57 | filing, metadata_row)) 58 | return None 59 | 60 | schedule_list = parsed_filing.list_schedules() 61 | # print("sked list is %s" % schedule_list) 
62 | 63 | result = parsed_filing.get_result() 64 | 65 | keyerrors = parsed_filing.get_keyerrors() 66 | 67 | if keyerrors: 68 | # If we find keyerrors--xpaths that are missing from our spec, note it 69 | print("Key error %s") 70 | has_keyerrors = len(keyerrors) > 0 71 | print("keyerror: %s" % keyerrors) 72 | filing.error_details = str(keyerrors) 73 | filing.key_error_count = len(keyerrors) 74 | filing.is_error = has_keyerrors 75 | filing.save() 76 | 77 | if result: 78 | for sked in result: 79 | self.process_sked(sked) 80 | else: 81 | print("Filing not parsed %s " % object_id) 82 | 83 | 84 | def __init__(self, queue): 85 | Thread.__init__(self) 86 | self.queue = queue 87 | 88 | def run(self): 89 | self.xml_runner = XMLRunner() 90 | self.accumulator = Accumulator() 91 | while True: 92 | filing = self.queue.get() 93 | self.run_filing(filing) 94 | self.queue.task_done() 95 | connection.close() 96 | 97 | class Command(BaseCommand): 98 | help = ''' 99 | Enter the filings, one by one. 100 | Loading is done in bulk, though status on the filings is updated one at a time. 
101 | 102 | ''' 103 | 104 | def add_arguments(self, parser): 105 | # Positional arguments 106 | parser.add_argument('year', nargs=1, type=int) 107 | 108 | def setup(self): 109 | # get an XMLRunner -- this is what actually does the parsing 110 | self.xml_runner = XMLRunner() 111 | self.accumulator = Accumulator() 112 | 113 | 114 | def process_sked(self, sked): 115 | """ Enter just one schedule """ 116 | #print("Processing schedule %s" % sked['schedule_name']) 117 | for part in sked['schedule_parts'].keys(): 118 | partname = part 119 | partdata = sked['schedule_parts'][part] 120 | #print("part %s %s" % (partname, partdata)) 121 | 122 | self.accumulator.add_model(partname, partdata) 123 | 124 | for groupname in sked['groups'].keys(): 125 | for groupdata in sked['groups'][groupname]: 126 | #print("group %s %s" % (groupname, groupdata) ) 127 | self.accumulator.add_model(groupname, groupdata) 128 | 129 | 130 | def run_filing(self, filing): 131 | 132 | object_id = filing.object_id 133 | 134 | parsed_filing = self.xml_runner.run_filing(object_id) 135 | if not parsed_filing: 136 | print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row)) 137 | return None 138 | 139 | schedule_list = parsed_filing.list_schedules() 140 | #print("sked list is %s" % schedule_list) 141 | 142 | result = parsed_filing.get_result() 143 | 144 | keyerrors = parsed_filing.get_keyerrors() 145 | 146 | if keyerrors: 147 | # If we find keyerrors--xpaths that are missing from our spec, note it 148 | print("Key error %s") 149 | has_keyerrors = len(keyerrors) > 0 150 | print("keyerror: %s" % keyerrors) 151 | filing.error_details = str(keyerrors) 152 | filing.key_error_count = len(keyerrors) 153 | filing.is_error = has_keyerrors 154 | filing.save() 155 | 156 | if result: 157 | for sked in result: 158 | self.process_sked(sked) 159 | else: 160 | print("Filing not parsed %s " % object_id) 161 | 162 | def handle(self, *args, **options): 163 | 164 | year = 
int(options['year'][0]) 165 | if year not in [2014, 2015, 2016, 2017, 2018]: 166 | raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2018" % year) 167 | 168 | print("Running filings during year %s" % year) 169 | self.setup() 170 | 171 | process_count = 0 172 | 173 | queue = Queue() 174 | # Create 8 worker threads 175 | for x in range(8): 176 | worker = DownloadWorker(queue) 177 | # Setting daemon to True will let the main thread exit even though the workers are blocking 178 | worker.daemon = True 179 | worker.start() 180 | 181 | 182 | while True: 183 | filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100] 184 | if not filings: 185 | print("Done") 186 | break 187 | 188 | object_id_list = [f.object_id for f in filings] 189 | 190 | # record that processing has begun 191 | Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True) 192 | 193 | for filing in filings: 194 | # print(filing) 195 | queue.put(filing) 196 | process_count += 1 197 | if process_count % 1000 == 0: 198 | print("Handled %s filings" % process_count) 199 | 200 | queue.join() 201 | 202 | # commit anything that's left 203 | self.accumulator.commit_all() 204 | # record that all are complete 205 | Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True) 206 | print("Processed a total of %s filings" % process_count) 207 | # Causes the main thread to wait for the queue to finish processing all the tasks 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /irsdb/metadata/management/commands/generate_schemas_from_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from django.core.management.base import BaseCommand 4 | from django.conf import settings 5 | 6 | from metadata.models import Variable, Group, SchedulePart 7 | from schemas.documentation_utils import 
most_recent, debracket 8 | from schemas.type_utils import get_django_type, get_sqlalchemy_type 9 | 10 | GENERATED_MODELS_DIR = settings.GENERATED_MODELS_DIR 11 | KNOWN_SCHEDULES = settings.KNOWN_SCHEDULES 12 | CANONICAL_VERSION = '2016v3.0' 13 | soft_tab = ' ' 14 | 15 | class Command(BaseCommand): 16 | help = """ Generate django model file. 17 | Hard overwrites the default file. 18 | 19 | SQLAlchemy in development as a CLI option ( -sqlalchemy ) 20 | Pretty rough at this point 21 | 22 | """ 23 | 24 | 25 | def add_arguments(self, parser): 26 | parser.add_argument('--sqlalchemy', action='store_true') 27 | 28 | parser.add_argument( 29 | "--schedule", 30 | choices=KNOWN_SCHEDULES, 31 | default=None, 32 | help='Get only that schedule' 33 | ) 34 | 35 | def write_model_top(self, sked_name, full_name, parent_sked_name, repeating_group_part=None): 36 | print("Handing part %s %s" % (sked_name, full_name)) 37 | 38 | if self.run_django: 39 | 40 | result = "\n#######\n#\n# %s - %s\n" % (parent_sked_name, full_name) 41 | if repeating_group_part: 42 | result += "# A repeating structure from %s\n" % (repeating_group_part) 43 | result += "#\n#######\n" 44 | ## write the start of the first group: 45 | result += "\nclass %s(models.Model):\n" % sked_name 46 | result += soft_tab + "object_id = models.CharField(max_length=31, blank=True, null=True, help_text=\"unique xml return id\")\n" 47 | result += soft_tab + "ein = models.CharField(max_length=15, blank=True, null=True, help_text=\"filer EIN\")\n" 48 | if parent_sked_name=='IRS990ScheduleK': 49 | # It's not clear what the max length is; Return.xsd is unclear 50 | result += soft_tab + "documentId = models.TextField(blank=True, null=True, help_text=\"documentID attribute\")" 51 | 52 | 53 | return result 54 | 55 | elif self.run_sqlalchemy: 56 | 57 | result = "\n#######\n#\n# %s - %s\n" % (parent_sked_name, full_name) 58 | if repeating_group_part: 59 | result += "# A repeating structure from %s\n" % (repeating_group_part) 60 | result += 
"#\n#######\n" 61 | ## write the start of the first group: 62 | result += "\nclass %s(Base):\n%s__tablename__='%s'\n" % (sked_name,soft_tab, sked_name) 63 | result += soft_tab + "object_id = Column(String(31))\n" 64 | result += soft_tab + "ein = Column(String(15))\n" 65 | if parent_sked_name=='IRS990ScheduleK': 66 | result += soft_tab + "documentId = Column(String(15))\n" 67 | 68 | result += soft_tab + "id = Column(Integer, primary_key=True)\n" # Add a primary key explicitly 69 | 70 | return result 71 | 72 | def write_top_matter(self): 73 | if self.run_django: 74 | self.outfile.write("from django.db import models\n") 75 | elif self.run_sqlalchemy: 76 | self.outfile.write("from sqlalchemy import Column, Integer, String, BigInteger, Text, Numeric\n") 77 | self.outfile.write("from sqlalchemy.ext.declarative import declarative_base\n\n") 78 | self.outfile.write("Base = declarative_base()\n\n") 79 | 80 | 81 | def write_variable(self, variable): 82 | """ 83 | We fallback to a text field, but we expect the types to be filled in where missing 84 | """ 85 | print("Write variable name %s type %s" % (variable.db_name, variable.db_type)) 86 | if self.run_django: 87 | variable_output = get_django_type(variable.irs_type) 88 | result = "\n" + soft_tab + "%s = models.%s" % (variable.db_name, variable_output) 89 | 90 | elif self.run_sqlalchemy: 91 | variable_output = get_sqlalchemy_type(variable.irs_type) 92 | result = "\n" + soft_tab + "%s = Column(%s)" % (variable.db_name, variable_output) 93 | 94 | # add newline and documentation regardless of where it's going 95 | result += "\n" + soft_tab + "#" 96 | if variable.line_number: 97 | result += " Line number: %s " % most_recent(debracket(variable.line_number)) 98 | if variable.description: 99 | result += " Description: %s " % most_recent(debracket(variable.description)) 100 | result += " most recent xpath: %s \n" % variable.xpath 101 | 102 | return result 103 | 104 | def write_sked(self, schedule): 105 | print("Handling schedule %s" 
% (schedule)) 106 | 107 | 108 | form_parts = SchedulePart.objects.filter(parent_sked=schedule).order_by('ordering') 109 | for form_part in form_parts: 110 | 111 | model_top = self.write_model_top( 112 | form_part.parent_sked_part, 113 | form_part.part_name, 114 | schedule 115 | ) 116 | 117 | variables_in_this_part = Variable.objects.filter( 118 | parent_sked_part=form_part.parent_sked_part, 119 | version_end__in=['','2016', '2017', '2018'], 120 | ).exclude(in_a_group=True).order_by('ordering',) 121 | if variables_in_this_part: 122 | # only write it if it contains anything 123 | self.outfile.write(model_top) 124 | print(model_top) 125 | 126 | for variable in variables_in_this_part: 127 | this_var = self.write_variable(variable) 128 | print(this_var) 129 | self.outfile.write(this_var) 130 | 131 | 132 | 133 | groups_in_this_sked = Group.objects.filter( 134 | parent_sked=schedule, 135 | version_end='', 136 | ).order_by('ordering',) 137 | 138 | for group in groups_in_this_sked: 139 | name = group.db_name 140 | if group.description: 141 | name += " - " + group.description 142 | model_top = self.write_model_top( 143 | group.db_name, 144 | name, 145 | schedule, 146 | repeating_group_part=group.parent_sked_part 147 | ) 148 | 149 | variables_in_this_group = Variable.objects.filter( 150 | db_table=group.db_name, 151 | version_end__in=['','2016', '2017', '2018'], 152 | ).order_by('ordering',) 153 | 154 | if variables_in_this_group: 155 | # only write it if it contains anything 156 | self.outfile.write(model_top) 157 | print(model_top) 158 | 159 | for variable in variables_in_this_group: 160 | this_var = self.write_variable(variable) 161 | print(this_var) 162 | self.outfile.write(this_var) 163 | 164 | 165 | def handle(self, *args, **options): 166 | print(options) 167 | self.run_sqlalchemy = options['sqlalchemy'] 168 | self.run_django = not self.run_sqlalchemy # Only run one or the other. 
169 | 170 | file_output = os.path.join(GENERATED_MODELS_DIR, "django_models_auto.py") 171 | if self.run_sqlalchemy: 172 | file_output = os.path.join(GENERATED_MODELS_DIR, "sqlalchemy_models_auto.py") 173 | self.outfile = open(file_output, 'w') 174 | 175 | self.write_top_matter() 176 | 177 | schedulename = options.get('schedule') 178 | if schedulename: 179 | print("Handling schedule %s" % schedulename) 180 | self.write_sked(schedulename) 181 | else: 182 | for schedulename in KNOWN_SCHEDULES: 183 | print("Handling schedule %s" % schedulename) 184 | self.write_sked(schedulename) 185 | 186 | -------------------------------------------------------------------------------- /irsdb/return/sql/delete_all_return.sql: -------------------------------------------------------------------------------- 1 | ------------ 2 | -- 3 | -- !! Removes every entry from all of the return tables. Use with caution !! 4 | -- 5 | -- Also resets the tracking fields in Filing so that each starts "fresh" 6 | -- and load_files.py will try to reload these. 
7 | -- (If parse_complete is not reset, load_files will think it's already "done" and skip it) 8 | ------------ 9 | 10 | delete from return_cntrctrcmpnstn; 11 | delete from return_ez_part_0; 12 | delete from return_ez_part_i; 13 | delete from return_ez_part_ii; 14 | delete from return_ez_part_iii; 15 | delete from return_ez_part_iv; 16 | delete from return_ez_part_v; 17 | delete from return_ez_part_vi; 18 | delete from return_ezcmpnstnhghstpdempl; 19 | delete from return_ezcmpnstnofhghstpdcntrct; 20 | delete from return_ezfrgnfnnclaccntcntrycd; 21 | delete from return_ezfrgnoffccntrycd; 22 | delete from return_ezoffcrdrctrtrstempl; 23 | delete from return_ezprgrmsrvcaccmplshmnt; 24 | delete from return_ezspclcndtndsc; 25 | delete from return_ezsttswhrcpyofrtrnisfldcd; 26 | delete from return_frgncntrycd; 27 | delete from return_frm990prtviisctna; 28 | delete from return_othrexpnss; 29 | delete from return_othrrvnmsc; 30 | delete from return_part_0; 31 | delete from return_part_i; 32 | delete from return_part_iii; 33 | delete from return_part_iv; 34 | delete from return_part_ix; 35 | delete from return_part_v; 36 | delete from return_part_vi; 37 | delete from return_part_vii; 38 | delete from return_part_viii; 39 | delete from return_part_x; 40 | delete from return_part_xi; 41 | delete from return_part_xii; 42 | delete from return_pf_part_0; 43 | delete from return_pf_part_i; 44 | delete from return_pf_part_ii; 45 | delete from return_pf_part_iii; 46 | delete from return_pf_part_iv; 47 | delete from return_pf_part_ixa; 48 | delete from return_pf_part_ixb; 49 | delete from return_pf_part_v; 50 | delete from return_pf_part_vi; 51 | delete from return_pf_part_viia; 52 | delete from return_pf_part_viib; 53 | delete from return_pf_part_viii; 54 | delete from return_pf_part_x; 55 | delete from return_pf_part_xi; 56 | delete from return_pf_part_xii; 57 | delete from return_pf_part_xiii; 58 | delete from return_pf_part_xiv; 59 | delete from return_pf_part_xv; 60 | delete 
from return_pf_part_xvia; 61 | delete from return_pf_part_xvib; 62 | delete from return_pf_part_xvii; 63 | delete from return_pfapplctnsbmssninf; 64 | delete from return_pfcmpnstnhghstpdempl; 65 | delete from return_pfcmpnstnofhghstpdcntrct; 66 | delete from return_pfcntrbtngmngrnm; 67 | delete from return_pfcpgnslsstxinvstincm; 68 | delete from return_pffrgncntrycd; 69 | delete from return_pfgrntorcntrapprvfrft; 70 | delete from return_pfgrntorcntrbtnpddryr; 71 | delete from return_pfoffcrdrtrstkyempl; 72 | delete from return_pforgrprtorrgstrsttcd; 73 | delete from return_pfothrrvndscrbd; 74 | delete from return_pfprgrmsrvcrvprtvii; 75 | delete from return_pfrlnofactytaccmofexmptprps; 76 | delete from return_pfrltnshpskddtl; 77 | delete from return_pfshrhldrmngrnm; 78 | delete from return_pfspclcndtndsc; 79 | delete from return_pftrnsfrskddtl; 80 | delete from return_prgrmsrvcrvn; 81 | delete from return_prgsrvcaccmactyothr; 82 | delete from return_returnheader990x_part_i; 83 | delete from return_skdaagrcltrlnmandaddrss; 84 | delete from return_skdafrm990skdaprtvi; 85 | delete from return_skdahsptlnmandaddrss; 86 | delete from return_skdaspprtdorginfrmtn; 87 | delete from return_skdbchrtblcntrbtnsdtl; 88 | delete from return_skdbcntrbtrinfrmtn; 89 | delete from return_skdbnncshprprtycntrbtn; 90 | delete from return_skdcsctn527pltclorg; 91 | delete from return_skdcspplmntlinfrmtndtl; 92 | delete from return_skddinvstprgrmrltdorg; 93 | delete from return_skddothrasstsorg; 94 | delete from return_skddothrlbltsorg; 95 | delete from return_skddothrscrts; 96 | delete from return_skddspplmntlinfrmtndtl; 97 | delete from return_skdespplmntlinfrmtndtl; 98 | delete from return_skdfaccntactvtsotsdus; 99 | delete from return_skdffrgnindvdlsgrnts; 100 | delete from return_skdfgrntstorgotsdus; 101 | delete from return_skdfspplmntlinfrmtndtl; 102 | delete from return_skdgfndrsractvtyinf; 103 | delete from return_skdglcnsdsttscd; 104 | delete from return_skdgspplmntlinfrmtndtl; 
105 | delete from return_skdgsttswhrgmngcndctdcd; 106 | delete from return_skdhhsptlfclts; 107 | delete from return_skdhhsptlfcltyplcsprctc; 108 | delete from return_skdhmngmntcandjntvntrs; 109 | delete from return_skdhothhlthcrfclts; 110 | delete from return_skdhspplmntlinfrmtn; 111 | delete from return_skdhspplmntlinfrmtndtl; 112 | delete from return_skdigrntsothrassttindvinus; 113 | delete from return_skdircpnttbl; 114 | delete from return_skdispplmntlinfrmtndtl; 115 | delete from return_skdjrltdorgoffcrtrstkyempl; 116 | delete from return_skdjspplmntlinfrmtndtl; 117 | delete from return_skdkprcdrscrrctvactn; 118 | delete from return_skdkspplmntlinfrmtndtl; 119 | delete from return_skdktxexmptbndsarbtrg; 120 | delete from return_skdktxexmptbndsisss; 121 | delete from return_skdktxexmptbndsprcds; 122 | delete from return_skdktxexmptbndsprvtbsus; 123 | delete from return_skdlbstrinvlvintrstdprsn; 124 | delete from return_skdldsqlfdprsnexbnfttr; 125 | delete from return_skdlgrntasstbnftintrstdprsn; 126 | delete from return_skdllnsbtwnorgintrstdprsn; 127 | delete from return_skdlspplmntlinfrmtndtl; 128 | delete from return_skdmothrnncshcntrtbl; 129 | delete from return_skdmspplmntlinfrmtndtl; 130 | delete from return_skdndspstnofasstsdtl; 131 | delete from return_skdnlqdtnofasstsdtl; 132 | delete from return_skdnspplmntlinfrmtndtl; 133 | delete from return_skdospplmntlinfrmtndtl; 134 | delete from return_skdriddsrgrddentts; 135 | delete from return_skdridrltdorgtxblcrptr; 136 | delete from return_skdridrltdorgtxblprtnrshp; 137 | delete from return_skdridrltdtxexmptorg; 138 | delete from return_skdrspplmntlinfrmtndtl; 139 | delete from return_skdrtrnsctnsrltdorg; 140 | delete from return_skdrunrltdorgtxblprtnrshp; 141 | delete from return_skeda_part_i; 142 | delete from return_skeda_part_ii; 143 | delete from return_skeda_part_iii; 144 | delete from return_skeda_part_iv; 145 | delete from return_skeda_part_v; 146 | delete from return_skeda_part_vi; 147 | delete from 
return_skedb_part_0; 148 | delete from return_skedb_part_ii; 149 | delete from return_skedc_part_0; 150 | delete from return_skedc_part_iia; 151 | delete from return_skedc_part_iib; 152 | delete from return_skedc_part_iiia; 153 | delete from return_skedc_part_iiib; 154 | delete from return_skedd_part_i; 155 | delete from return_skedd_part_ii; 156 | delete from return_skedd_part_iii; 157 | delete from return_skedd_part_iv; 158 | delete from return_skedd_part_ix; 159 | delete from return_skedd_part_v; 160 | delete from return_skedd_part_vi; 161 | delete from return_skedd_part_vii; 162 | delete from return_skedd_part_viii; 163 | delete from return_skedd_part_x; 164 | delete from return_skedd_part_xi; 165 | delete from return_skedd_part_xii; 166 | delete from return_skede_part_i; 167 | delete from return_skedf_part_i; 168 | delete from return_skedf_part_ii; 169 | delete from return_skedf_part_iv; 170 | delete from return_skedg_part_i; 171 | delete from return_skedg_part_ii; 172 | delete from return_skedg_part_iii; 173 | delete from return_skedh_part_i; 174 | delete from return_skedh_part_ii; 175 | delete from return_skedh_part_iii; 176 | delete from return_skedh_part_va; 177 | delete from return_skedh_part_vd; 178 | delete from return_skedi_part_i; 179 | delete from return_skedi_part_ii; 180 | delete from return_skedj_part_i; 181 | delete from return_skedl_part_i; 182 | delete from return_skedl_part_ii; 183 | delete from return_skedm_part_i; 184 | delete from return_skedn_part_i; 185 | delete from return_skedn_part_ii; 186 | delete from return_skedr_part_v; 187 | delete from return_spclcndtndsc; 188 | delete from return_sttswhrcpyofrtrnisfldcd; 189 | 190 | 191 | -- Now reset tracking error fields. 
192 | 193 | 194 | update filing_filing set parse_started=False where parse_started = True; 195 | update filing_filing set parse_complete=False where parse_complete = True; 196 | update filing_filing set process_time=Null where not process_time is Null; 197 | 198 | update filing_filing set is_error=False where is_error = True; 199 | update filing_filing set key_error_count=Null where not key_error_count is Null; 200 | update filing_filing set error_details =Null where not error_details is Null; 201 | -------------------------------------------------------------------------------- /irsdb/metadata/views.py: -------------------------------------------------------------------------------- 1 | from irsx._version import __version__ as irsx_version 2 | from datetime import datetime 3 | from django.shortcuts import get_object_or_404, render 4 | from django.conf import settings 5 | from django.db import connection 6 | from django.template.loader import render_to_string 7 | 8 | from .models import Variable, LineNumber, Description, SchedulePart, Group 9 | 10 | 11 | 12 | KNOWN_SCHEDULES = settings.KNOWN_SCHEDULES 13 | 14 | 15 | # We're too low rent to install django-bakery 16 | # in the future we should use CBV's and use it 17 | # This app is odd in that it tries to only consume 18 | # published metadata .csvs, hence oddness in the models 19 | # which reflect the files, rather than the data 20 | 21 | # The base of the file system 22 | try: 23 | FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE 24 | except ImportError: 25 | FILE_SYSTEM_BASE = '' 26 | # When set to true will 'cache' a baked version of the page 27 | # To run a full bake, run a 'scrape' of every page that needs update 28 | # Then deploy files 29 | # Any new static files need to be moved into place, this is just a hack 30 | 31 | BAKE_OUT = True 32 | 33 | def bake(request, template, context, filepath=None): 34 | path = request.META['PATH_INFO'] 35 | if filepath: 36 | path = filepath 37 | full_path = FILE_SYSTEM_BASE + 
path # should be an os.join process here 38 | 39 | 40 | print("Bake with full_path = %s" % full_path) 41 | with open(full_path, "w") as f: 42 | f.write(render_to_string(template, context)) 43 | 44 | 45 | def show_xpath(request, xpath): 46 | """ 47 | Show a single xpath 48 | """ 49 | raw_xpath = xpath 50 | xpath = xpath.replace("-","/") 51 | 52 | print("Xpath is '%s'" % xpath) 53 | this_variable = get_object_or_404(Variable, xpath=xpath) 54 | line_numbers = LineNumber.objects.filter(xpath=xpath) 55 | descriptions = Description.objects.filter(xpath=xpath) 56 | if len(line_numbers)<2: 57 | line_numbers = None 58 | if len(descriptions)<2: 59 | descriptions = None 60 | 61 | context = { 62 | 'this_variable': this_variable, 63 | 'line_numbers':line_numbers, 64 | 'descriptions':descriptions 65 | } 66 | template = 'metadata/xpath.html' 67 | 68 | if BAKE_OUT: 69 | filepath = "/metadata/xpath/" + raw_xpath + ".html" 70 | bake(request, template, context, filepath=filepath) 71 | 72 | return render(request, template, context) 73 | 74 | def show_about(request): 75 | context = { 76 | 'version':irsx_version, 77 | 'update':datetime.now(), 78 | } 79 | template = 'metadata/about.html' 80 | 81 | if BAKE_OUT: 82 | bake(request, template, context) 83 | return render(request, template, context) 84 | 85 | def show_variable(request, db_name, variable_name): 86 | """ 87 | Show a single variable 88 | """ 89 | print("Variable is '%s'" % variable_name) 90 | variables = Variable.objects.filter(db_table=db_name, db_name=variable_name) 91 | this_variable = variables[0] 92 | xpaths = variables.values_list('xpath', 'version_start', 'version_end') 93 | result_xpaths = [] 94 | for xpath in xpaths: 95 | result_xpaths.append({ 96 | 'xpath':xpath[0], 97 | 'url':"/metadata/xpath/" + xpath[0].replace("/","-") + ".html", 98 | 'version_start':xpath[1], 99 | 'version_end':xpath[2], 100 | }) 101 | 102 | print("xpaths are %s" % result_xpaths) 103 | 104 | 105 | this_variable = variables[0] 106 | context = { 107 
| 'this_variable': this_variable, 108 | 'xpaths':result_xpaths 109 | } 110 | template = 'metadata/variable.html' 111 | 112 | if BAKE_OUT: 113 | filepath = this_variable.get_absolute_url() 114 | bake(request, template, context) 115 | return render(request, template, context) 116 | 117 | def show_part(request, part): 118 | this_part = get_object_or_404(SchedulePart, parent_sked_part=part) 119 | related_groups = Group.objects.filter(parent_sked_part=part) 120 | groups = [] 121 | group_names = [] 122 | for group in related_groups: 123 | if group.db_name not in group_names: 124 | groups.append({ 125 | 'db_name':group.db_name, 126 | 'get_absolute_url':group.get_absolute_url() 127 | }) 128 | group_names.append(group.db_name) 129 | 130 | variables = Variable.objects.filter(parent_sked_part=part, in_a_group=False).exclude(version_end__in=['2013', '2014', '2015']).order_by('line_number', 'ordering') 131 | context = { 132 | 'this_part': this_part, 133 | 'variables':variables, 134 | 'related_groups':groups, 135 | } 136 | template = 'metadata/part.html' 137 | 138 | if BAKE_OUT: 139 | bake(request, template, context) 140 | return render(request, template, context) 141 | 142 | def show_group(request, group): 143 | this_group = Group.objects.filter(db_name=group)[0] 144 | variables = Variable.objects.filter(db_table=group).exclude(version_end__in=['2013', '2014', '2015']).order_by('line_number', 'ordering') 145 | 146 | template = 'metadata/group.html' 147 | context = { 148 | 'this_group': this_group, 149 | 'variables':variables, 150 | } 151 | 152 | if BAKE_OUT: 153 | bake(request, template, context) 154 | return render(request, template,context ) 155 | 156 | def join_groups_to_parts(): 157 | with connection.cursor() as cursor: 158 | # Sigh. 
159 | RAW_SQL = """ 160 | SELECT DISTINCT 161 | metadata_group.parent_sked, 162 | metadata_group.parent_sked_part, 163 | metadata_group.db_name, 164 | metadata_schedulepart.ordering 165 | FROM 166 | metadata_group 167 | LEFT JOIN 168 | metadata_schedulepart 169 | ON metadata_group.parent_sked_part = metadata_schedulepart.parent_sked_part 170 | AND metadata_group.parent_sked = metadata_schedulepart.parent_sked 171 | ORDER BY 172 | metadata_group.parent_sked, 173 | metadata_schedulepart.ordering; 174 | """ 175 | cursor.execute(RAW_SQL) 176 | rows = cursor.fetchall() 177 | result_obj = [] 178 | for row in rows: 179 | result_obj.append({ 180 | 'parent_sked':row[0], 181 | 'parent_sked_part':row[1], 182 | 'group_name':row[2], 183 | }) 184 | return result_obj 185 | 186 | 187 | def show_forms(request): 188 | """ 189 | Show all form parts - this is gnarly and should be baked / cached 190 | """ 191 | parts = SchedulePart.objects.all().order_by('parent_sked','ordering') 192 | form_hash = {} 193 | part_hash = {} 194 | 195 | # Sorta laboriously rebuild data structure from metadata.csv files. They weren't designed for this! 
196 | for schedule in KNOWN_SCHEDULES: 197 | form_hash[schedule] = {'schedule_name':schedule, 'parts':[]} 198 | for part in parts: 199 | try: 200 | form_hash[part.parent_sked]['parts'].append(part) 201 | except KeyError: 202 | form_hash[part.parent_sked] = {'schedule_name':part.parent_sked, 'parts':[part]} 203 | 204 | sked_part_hash = {} 205 | joined_groups = join_groups_to_parts() 206 | for jg in joined_groups: 207 | try: 208 | sked_part_hash[jg['parent_sked_part']]['groups'].append(jg['group_name']) 209 | except KeyError: 210 | sked_part_hash[jg['parent_sked_part']] = {'groups':[jg['group_name']]} 211 | 212 | return_array = [] 213 | for fkey in form_hash.keys(): 214 | this_data_obj = {'sked_name':fkey, 'parts':[]} 215 | for i, part in enumerate(form_hash[fkey]['parts']): 216 | part_obj = {} 217 | part_obj['part'] = part 218 | part_obj['groups'] = [] 219 | part_obj['name'] = part.parent_sked_part 220 | try: 221 | groups = sked_part_hash[part.parent_sked_part]['groups'] 222 | part_obj['groups'] = groups 223 | except KeyError: 224 | part_obj['groups'] = '' 225 | this_data_obj['parts'].append(part_obj) 226 | return_array.append(this_data_obj) 227 | 228 | print(return_array) 229 | template = 'metadata/forms.html' 230 | context = { 231 | 'forms':return_array 232 | } 233 | if BAKE_OUT: 234 | bake(request, template, context) 235 | return render(request, template, context) -------------------------------------------------------------------------------- /contractors.sh: -------------------------------------------------------------------------------- 1 | -- Contractor compensation 2 | 3 | 4 | -- Form 990: 5 | 6 | 7 | DROP TABLE IF EXISTS contractor_comp_990; 8 | 9 | SELECT 10 | address_table.ein, 11 | address_table.object_id, 12 | address_table."RtrnHdr_TxPrdEndDt", 13 | address_table."RtrnHdr_TxYr", 14 | address_table."BsnssOffcr_SgntrDt", 15 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1", 16 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21", 17 | 
address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm", 18 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt", 19 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" , 20 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" , 21 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt", 22 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt", 23 | address_table."USAddrss_CtyNm" as "Org_CtyNm", 24 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd", 25 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd", 26 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt", 27 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt", 28 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm", 29 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm", 30 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd", 31 | return_CntrctrCmpnstn."CntrctrNm_PrsnNm" as "CntrctrNm_PrsnNm", 32 | trim(concat(return_CntrctrCmpnstn."BsnssNm_BsnssNmLn1Txt", ' ', return_CntrctrCmpnstn."BsnssNm_BsnssNmLn2Txt")) as "Cntrctr_Business", 33 | trim(concat(return_CntrctrCmpnstn."USAddrss_AddrssLn1Txt", ' ', return_CntrctrCmpnstn."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1", 34 | trim(concat(return_CntrctrCmpnstn."USAddrss_AddrssLn2Txt", ' ', return_CntrctrCmpnstn."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2", 35 | trim(concat(return_CntrctrCmpnstn."USAddrss_CtyNm", ' ', return_CntrctrCmpnstn."FrgnAddrss_CtyNm")) as "Cntrctr_City", 36 | trim(concat(return_CntrctrCmpnstn."USAddrss_ZIPCd", ' ', return_CntrctrCmpnstn."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP", 37 | trim(concat(return_CntrctrCmpnstn."USAddrss_SttAbbrvtnCd" , ' ', return_CntrctrCmpnstn."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State", 38 | return_CntrctrCmpnstn."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd", 39 | return_CntrctrCmpnstn."CntrctrCmpnstn_SrvcsDsc" as "SrvcsDsc", 40 | 
return_CntrctrCmpnstn."CntrctrCmpnstn_CmpnstnAmt" as "CmpnstnAmt" 41 | INTO TEMPORARY TABLE contractor_comp_990 42 | FROM return_CntrctrCmpnstn 43 | LEFT JOIN address_table ON return_CntrctrCmpnstn.object_id = address_table.object_id 44 | AND return_CntrctrCmpnstn.ein = address_table.ein; 45 | 46 | 47 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990' as form, contractor_comp_990.* into temporary table contractor_comp_990_types from contractor_comp_990 left join org_types on contractor_comp_990.object_id = org_types.object_id and contractor_comp_990.ein = org_types.ein; 48 | 49 | 50 | \copy contractor_comp_990_types to '/data/file_exports/contractors_990.csv' with csv header; 51 | 52 | 53 | 54 | -- 990 PF 55 | 56 | DROP TABLE IF EXISTS contractor_comp_990_pf; 57 | 58 | SELECT 59 | address_table.ein, 60 | address_table.object_id, 61 | address_table."RtrnHdr_TxPrdEndDt", 62 | address_table."RtrnHdr_TxYr", 63 | address_table."BsnssOffcr_SgntrDt", 64 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1", 65 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21", 66 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm", 67 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt", 68 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" , 69 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" , 70 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt", 71 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt", 72 | address_table."USAddrss_CtyNm" as "Org_CtyNm", 73 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd", 74 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd", 75 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt", 76 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt", 77 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm", 78 | 
address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm", 79 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd", 80 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_PrsnNm" as "CntrctrNm_PrsnNm", 81 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn1", ' ', return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn2")) as "Cntrctr_Business", 82 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn1Txt", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1", 83 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn2Txt", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2", 84 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_CtyNm", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_CtyNm")) as "Cntrctr_City", 85 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_ZIPCd", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP", 86 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_SttAbbrvtnCd" , ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State", 87 | return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd", 88 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_SrvcTxt" as "SrvcsDsc", 89 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_CmpnstnAmt" as "CmpnstnAmt" 90 | INTO TEMPORARY TABLE contractor_comp_990_pf 91 | FROM return_PFCmpnstnOfHghstPdCntrct 92 | LEFT JOIN address_table ON return_PFCmpnstnOfHghstPdCntrct.object_id = address_table.object_id 93 | AND return_PFCmpnstnOfHghstPdCntrct.ein = address_table.ein; 94 | 95 | 96 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990PF' as form, contractor_comp_990_pf.* into temporary table contractor_comp_990_pf_types from contractor_comp_990_pf left join org_types on contractor_comp_990_pf.object_id = 
org_types.object_id and contractor_comp_990_pf.ein = org_types.ein; 97 | 98 | 99 | \copy contractor_comp_990_pf_types to '/data/file_exports/contractor_comp_990_pf.csv' with csv header; 100 | 101 | 102 | -- 990EZ 103 | 104 | DROP TABLE IF EXISTS contractor_comp_990_ez; 105 | 106 | SELECT 107 | address_table.ein, 108 | address_table.object_id, 109 | address_table."RtrnHdr_TxPrdEndDt", 110 | address_table."RtrnHdr_TxYr", 111 | address_table."BsnssOffcr_SgntrDt", 112 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1", 113 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21", 114 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm", 115 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt", 116 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" , 117 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" , 118 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt", 119 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt", 120 | address_table."USAddrss_CtyNm" as "Org_CtyNm", 121 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd", 122 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd", 123 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt", 124 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt", 125 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm", 126 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm", 127 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd", 128 | return_EZCmpnstnOfHghstPdCntrct ."CmpnstnOfHghstPdCntrct_PrsnNm" as "CntrctrNm_PrsnNm", 129 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."CmpnstnOfHghstPdCntrct_BsnssNmLn1", ' ', return_EZCmpnstnOfHghstPdCntrct ."CmpnstnOfHghstPdCntrct_BsnssNmLn2")) as "Cntrctr_Business", 130 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."USAddrss_AddrssLn1Txt", ' ', return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1", 
131 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."USAddrss_AddrssLn2Txt", ' ', return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2", 132 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."USAddrss_CtyNm", ' ', return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_CtyNm")) as "Cntrctr_City", 133 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."USAddrss_ZIPCd", ' ', return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP", 134 | trim(concat(return_EZCmpnstnOfHghstPdCntrct ."USAddrss_SttAbbrvtnCd" , ' ', return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State", 135 | return_EZCmpnstnOfHghstPdCntrct ."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd", 136 | return_EZCmpnstnOfHghstPdCntrct ."CmpnstnOfHghstPdCntrct_SrvcTxt" as "SrvcsDsc", 137 | return_EZCmpnstnOfHghstPdCntrct ."CmpnstnOfHghstPdCntrct_CmpnstnAmt" as "CmpnstnAmt" 138 | INTO TEMPORARY TABLE contractor_comp_990_ez 139 | FROM return_EZCmpnstnOfHghstPdCntrct 140 | LEFT JOIN address_table ON return_EZCmpnstnOfHghstPdCntrct .object_id = address_table.object_id 141 | AND return_EZCmpnstnOfHghstPdCntrct .ein = address_table.ein; 142 | 143 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990EZ' as form, contractor_comp_990_ez.* into temporary table contractor_comp_990_ez_types from contractor_comp_990_ez left join org_types on contractor_comp_990_ez.object_id = org_types.object_id and contractor_comp_990_ez.ein = org_types.ein; 144 | 145 | \copy contractor_comp_990_ez_types to '/data/file_exports/contractor_comp_990_ez.csv' with csv header; 146 | 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 990-xml-database 2 | Django app to consume and store 990 data and metadata. 
Depends on [IRSx](https://github.com/jsfenfen/990-xml-reader) (which is installed as a dependency below). 3 | 4 | ## Setup and use 5 | 6 | ### Part 1: clone the repo and configure the app 7 | 8 | 1. git clone this repository `git clone https://github.com/jsfenfen/990-xml-database.git` and `$ cd 990-xml-database` 9 | 10 | 2. install the requirements with `pip install -r requirements.txt`. This is Django 2, so only python3 is supported. 11 | 12 | 3. copy the irsdb/local\_settings.py-example file to irsdb\/local_settings.py and edit it to reflect your database settings. 13 | 14 | 15 | ### Part 2: Add the metadata 16 | 17 | 18 | 1. run `python manage.py makemigrations metadata` to generate the metadata migrations, and then run them with `python manage.py migrate metadata`. 19 | 20 | 2. Load the metadata from source csv files in generated\_schemas with the management command: `python manage.py load_metadata`. This command erases the metadata before loading, so it can be rerun if it somehow breaks in the middle. 21 | 22 | 3. If the csv files have changed you can generate migrations for the db by generating the models with `python manage.py generate_schemas_from_metadata` which puts the new models file in generated\_schemas/ as `django_models_auto.py` and then moving the generated models file into return/models.py and running `python manage.py makemigrations return`. 23 | 24 | ### Part 3: index file data 25 | 26 | The IRS releases metadata files which include the unique id, EIN and other information about each .xml filing. We need to put this in the database to make sense of the raw filings. 27 | 28 | 1. run `python manage.py makemigrations filing` to generate the filing migrations, and then run them with `python manage.py migrate filing`. 29 | 30 | 2. Run `$ python manage.py enter_yearly_submissions <YYYY>` where YYYY is the year corresponding to a yearly index file that has already been downloaded.
{ If it hasn't been downloaded you can retrieve it with irsx_index --year=YYYY }. This script checks to see if the IRS' index file is any bigger than the one on disk, and only runs if it has. You can force it to try to enter any new filings (regardless of whether the file is updated) with the `--enter` option. 31 | 32 | #### Sidebar: 2014 file may need fixing 33 | __There's a problem with the 2014 index file.__ An internal comma has "broken" the .csv format for some time. You can fix it with a perl one liner (which first backs the file up to index_2014.csv.bak before modifying it) 34 | 35 | $ perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' index_2014.csv 36 | 37 | We can see that it worked by diffing it. 38 | 39 | $ diff index_2014.csv index_2014.csv.bak 40 | 39569c39569 41 | < 11146506,EFILE,136171217,201212,1/14/2014,MOSTYN FOUNDATION INC CO SILVERCREST ASSET MANAGEMENT,990PF,93491211007003,201302119349100700 42 | --- 43 | > 11146506,EFILE,136171217,201212,1/14/2014,MOSTYN FOUNDATION INC CO SILVERCREST ASSET ,AMAGEMENT,990PF,93491211007003,201302119349100700 44 | 45 | For more details see [here](https://github.com/jsfenfen/990-xml-reader/blob/master/2014_is_broken.md). 46 | 47 | ### Part 5: Generate the schema files - Not recommended, this is only used when regenerating models for a new IRSX version 48 | 49 | Run `$ python manage.py generate_schemas_from_metadata` to generate a django models file (to the directory generated_models). You can modify these and put them into return/models. 50 | 51 | ### Part 6. Create the return tables 52 | 53 | Create the tables in the return model by running the migrations. 54 | 55 | `$ python manage.py makemigrations return` 56 | To make the migrations and 57 | `$ python manage.py migrate return` 58 | to run them. 59 | 60 | ### Part 7. Load the filings 61 | 62 | Actually enter the filings into the database with 63 | `$ python manage.py load_filings <YYYY>`.
This script will take a while to run--probably at least several hours per year. You'll likely want to run it using nohup, so something like this:


`$ nohup python manage.py load_filings &`

Which detaches the terminal from the process, so if your connection times out the command keeps running.

You may want to adjust your postgres settings for better loading, but you'll need to pay attention to overall memory and resource use.

### Post-loading concerns


#### Analyze the load process

The loading process uses columns in the filing model to track the load process (and to ensure the same files aren't loaded twice).

TK - explanation of keyerrors


#### Removing all rows

There's a [sql script](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/sql/delete_all_return.sql) that will remove all entered rows from all return tables and reset the fields in filing as if they were new.

If you want to live dangerously, you can run it from the console like this:

`$ python manage.py dbshell < ./return/sql/delete_all_return.sql`


#### Adding or removing indexes

There are management commands to create or drop indexes on object\_id, ein and (for schedule K) documentId. Use
`$ python manage.py make_indexes` or
`$ python manage.py drop_indexes` . These are just conveniences to create indexes named `xx_<fieldname>`--they won't remove other indexes.

#### Removing a subset of all rows

You can remove all filings from a given index file with the [remove_year](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/management/commands/remove_year.py) management command. It's likely to run faster if indexes are in place.

#### Removing only the rows that were half loaded

If loading gets interrupted, you can remove only the rows where parse\_started is true and parse\_complete is not with the management command [remove\_half\_loaded](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/management/commands/remove_half_loaded.py). It also requires a year as a command line argument.

`$ python manage.py remove_half_loaded 2018`

#### File size concerns

The full download of uncompressed .xml files is over ~74 gigabytes. Processing a complete year of data probably entails moving at least 15 gigs of xml.

You probably want to look into a tool to help you move these files in bulk. AWS' S3 CLI can dramatically reduce download time, but seems unhelpful when trying to pull a subset of files (it seems like [--exclude '*'](https://docs.aws.amazon.com/cli/latest/reference/s3/index.html#use-of-exclude-and-include-filters) hangs when processing so many files). You may want to look into moving all the files to your own S3 bucket as well. There are also alternatives to AWS' CLI tool, like [S3 CMD](http://s3tools.org/s3cmd).

You'll also want to [configure the IRSx file cache directory](https://github.com/jsfenfen/990-xml-reader/#configuring-the-file-cache-directory) to set the WORKING_DIRECTORY variable to the file path of the folder where the xml files are located.

The worst option is to download the uncompressed files one at a time. That sounds really, really slow.


#### Server considerations

With most hosting providers, you'll need to configure additional storage to support the static files and the database that's ultimately loaded. Make sure that you set the database storage directory to *that storage*, and get the fastest storage type you can afford.

You may want to look into tuning your database parameters to better support data loading. And you'll get better performance if you only create indexes after loading is complete (and delete them before bulk loads take place).

One random datapoint: on an Amazon t2.medium ec2 server (~$38/month) with 150 gigs of additional storage and postgres running on the default configs and writing to an SSD EBS volume, load time for the complete set of about 490,000 filings from 2017 took about 3 hours.

#### Monthly load

This assumes no schema changes are required, which is usually the case.

Run an S3 sync to the location of the filings. The whole collection is now over 80 GB, so make sure you have room. You can also retrieve the files some other way (if you don't retrieve en masse the load_filings.py script will attempt to download one filing at a time). It's useful to run this with nohup, i.e.

    nohup aws s3 sync s3://irs-form-990/ ./ &

Then update the index file data

    $ python manage.py enter_yearly_submissions 2018


    index_2018.csv has changed. Downloading updated file...
    Done!
    Entering xml submissions from /home/webuser/virt/env/lib/python3.5/site-packages/irsx/CSV/index_2018.csv

    Committing 10000 total entered=10000
    commit complete
    Committing 10000 total entered=20000
    commit complete
    Added 24043 new entries.

Then enter the filings into the relational database with:

    $ python manage.py load_filings 2018

    Running filings during year 2018
    Processed a total of 100 filings
    Processed a total of 200 filings
    Processed a total of 300 filings

    ...

    Handled 24000 filings
    Processed a total of 24000 filings
    Processed a total of 24043 filings
    Done

This script finds filings where `submission_year` is the entered year and `parse_complete` has not been set to True.
It enters them in groups of 100 and sets `parse_complete` to True after each batch has completed. The script is fairly fault tolerant, but if it dies in the middle it's important to remove all the half entered filings where `parse_started` = True and `parse_complete` is not True. (By default it is null, so don't try to match on `parse_complete` = False). 168 | 169 | 170 | -- -------------------------------------------------------------------------------- /sked_l.sh: -------------------------------------------------------------------------------- 1 | 2 | -- Schedule L - Transactions with interested parties 3 | 4 | -- Part I: Excess Benefit Transactions 5 | -- See the repeating group docs [here](http://www.irsx.info/metadata/groups/SkdLDsqlfdPrsnExBnftTr.html) 6 | 7 | 8 | 9 | DROP TABLE IF EXISTS excess_benefits; 10 | 11 | SELECT 12 | address_table."RtrnHdr_TxPrdEndDt", 13 | address_table."RtrnHdr_TxYr", 14 | address_table."BsnssOffcr_SgntrDt", 15 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1", 16 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21", 17 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm", 18 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt", 19 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" , 20 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" , 21 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt", 22 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt", 23 | address_table."USAddrss_CtyNm" as "Org_CtyNm", 24 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd", 25 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd", 26 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt", 27 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt", 28 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm", 29 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm", 30 | 
address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
return_SkdLDsqlfdPrsnExBnftTr.*
INTO TEMPORARY TABLE excess_benefits
FROM return_SkdLDsqlfdPrsnExBnftTr
LEFT JOIN address_table ON return_SkdLDsqlfdPrsnExBnftTr.object_id = address_table.object_id
AND return_SkdLDsqlfdPrsnExBnftTr.ein = address_table.ein;


DROP TABLE IF EXISTS excess_benefits_types;

-- Attach the filer's org-type indicator columns plus a url_base ("ein/object_id")
-- to each excess-benefit row.
-- NOTE(review): the sibling *_types queries below join org_types on both object_id
-- and ein; this one joins on object_id only. Kept as-is to preserve output --
-- confirm whether the ein clause was intentionally omitted here.
select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, excess_benefits.* into TEMPORARY TABLE excess_benefits_types from excess_benefits left join org_types on excess_benefits.object_id = org_types.object_id;

\copy excess_benefits_types to '/data/file_exports/excess_benefits.csv' with csv header;




-- Part II: Loans Between the Organization and Interested Persons

-- Loans from the org to an insider
-- See the repeating group docs [here](http://www.irsx.info/metadata/groups/SkdLLnsBtwnOrgIntrstdPrsn.html)



DROP TABLE IF EXISTS loans_from;

SELECT
address_table."RtrnHdr_TxPrdEndDt",
address_table."RtrnHdr_TxYr",
address_table."BsnssOffcr_SgntrDt",
address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
-- NOTE(review): "Org_BsnssNmL21" looks like a typo for "Org_BsnssNmLn2", but it is a
-- CSV output header used consistently throughout this file, so it is kept byte-for-byte.
address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
address_table."USAddrss_CtyNm" as "Org_CtyNm",
address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
return_SkdLLnsBtwnOrgIntrstdPrsn.*

INTO TEMPORARY TABLE loans_from
FROM return_SkdLLnsBtwnOrgIntrstdPrsn
LEFT JOIN address_table ON return_SkdLLnsBtwnOrgIntrstdPrsn.object_id = address_table.object_id
AND return_SkdLLnsBtwnOrgIntrstdPrsn.ein = address_table.ein
WHERE return_SkdLLnsBtwnOrgIntrstdPrsn."LnFrmOrgnztnInd" = 'X';


drop table if exists loans_from_types;

select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, loans_from.* into temporary table loans_from_types from loans_from left join org_types on loans_from.object_id = org_types.object_id and loans_from.ein = org_types.ein;


\copy loans_from_types to '/data/file_exports/loans_from.csv' with csv header;



-- Loans from an insider to the org



DROP TABLE IF EXISTS loans_to;

SELECT
address_table."RtrnHdr_TxPrdEndDt",
address_table."RtrnHdr_TxYr",
address_table."BsnssOffcr_SgntrDt",
address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
address_table."USAddrss_CtyNm" as "Org_CtyNm",
address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
return_SkdLLnsBtwnOrgIntrstdPrsn.*
INTO TEMPORARY TABLE loans_to
FROM return_SkdLLnsBtwnOrgIntrstdPrsn
LEFT JOIN address_table ON return_SkdLLnsBtwnOrgIntrstdPrsn.object_id = address_table.object_id
AND return_SkdLLnsBtwnOrgIntrstdPrsn.ein = address_table.ein
WHERE return_SkdLLnsBtwnOrgIntrstdPrsn."LnTOrgnztnInd" = 'X';

drop table if exists loans_to_types;

select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, loans_to.* into TEMPORARY TABLE loans_to_types from loans_to left join org_types on loans_to.object_id = org_types.object_id and loans_to.ein = org_types.ein;


\copy loans_to_types to '/data/file_exports/loans_to.csv' with csv header;


-- Part III: Grants or Assistance Benefiting Interested Persons

-- http://www.irsx.info/metadata/groups/SkdLGrntAsstBnftIntrstdPrsn.html

DROP TABLE IF EXISTS insider_assistance;

SELECT
address_table."RtrnHdr_TxPrdEndDt",
address_table."RtrnHdr_TxYr",
address_table."BsnssOffcr_SgntrDt",
address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
address_table."USAddrss_CtyNm" as "Org_CtyNm",
address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
return_SkdLGrntAsstBnftIntrstdPrsn.*
INTO TEMPORARY TABLE insider_assistance
FROM return_SkdLGrntAsstBnftIntrstdPrsn
LEFT JOIN address_table ON return_SkdLGrntAsstBnftIntrstdPrsn.object_id = address_table.object_id
-- FIX: this statement previously had no terminating semicolon, so psql ran it
-- together with the DROP TABLE below and failed with a syntax error.
AND return_SkdLGrntAsstBnftIntrstdPrsn.ein = address_table.ein;


drop table if exists insider_assistance_types;

select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, insider_assistance.* into temporary table insider_assistance_types from insider_assistance left join org_types on insider_assistance.object_id = org_types.object_id and insider_assistance.ein = org_types.ein;


\copy insider_assistance_types to '/data/file_exports/insider_assistance.csv' with csv header;



-- Part IV: Business Transactions Involving Interested Persons


-- http://www.irsx.info/metadata/groups/SkdLBsTrInvlvIntrstdPrsn.html

DROP TABLE IF EXISTS insider_transactions;

SELECT
address_table."RtrnHdr_TxPrdEndDt",
address_table."RtrnHdr_TxYr",
address_table."BsnssOffcr_SgntrDt",
address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
address_table."USAddrss_CtyNm" as "Org_CtyNm",
address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
return_SkdLBsTrInvlvIntrstdPrsn.*
INTO TEMPORARY TABLE insider_transactions
FROM return_SkdLBsTrInvlvIntrstdPrsn
LEFT JOIN address_table ON return_SkdLBsTrInvlvIntrstdPrsn.object_id = address_table.object_id
AND return_SkdLBsTrInvlvIntrstdPrsn.ein = address_table.ein;

drop table if exists insider_transactions_types;

select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, insider_transactions.* into temporary table insider_transactions_types from insider_transactions left join org_types on insider_transactions.object_id = org_types.object_id and insider_transactions.ein = org_types.ein;


\copy
insider_transactions_types to '/data/file_exports/insider_transactions.csv' with csv header; 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /irsdb/dump_from_manifest.py: -------------------------------------------------------------------------------- 1 | 2 | import unicodecsv as csv 3 | from irsx.xmlrunner import XMLRunner 4 | 5 | from irsx.filing import FileMissingException 6 | from stream_extractor import StreamExtractor 7 | 8 | 9 | 10 | output_streams = { 11 | 12 | '990_part_0': { 13 | 'filename':'990_part_0', 14 | 'headers': ["ein", "object_id", 'Orgnztn527Ind', 'Orgnztn501cInd', 'Orgnztn49471NtPFInd', 'Orgnztn501c3Ind', 'WbstAddrssTxt', 'OfOrgnztnTrstInd', 'OthrOrgnztnDsc', 'OfOrgnztnCrpInd', 'OfOrgnztnOthrInd', 'OfOrgnztnAsscInd', 'FrmtnYr', 'LglDmclSttCd', 'LglDmclCntryCd'] 15 | }, 16 | '990_part_i': { 17 | 'filename':'990_part_i', 18 | 'headers': ["ein", "object_id", "CntrctTrmntnInd", "TtlEmplyCnt", "TtlVlntrsCnt", "CYInvstmntIncmAmt", "CYTtlRvnAmt", "CYTtlExpnssAmt", "CYRvnsLssExpnssAmt", "TtlAsstsEOYAmt", "ActvtyOrMssnDsc" ] 19 | }, 20 | '990_part_iv': { 21 | 'filename':'990_part_iv', 22 | 'headers': ["ein", "object_id", "PrtlLqdtnInd"] 23 | }, 24 | '990ez_part_0': { 25 | 'filename':'990ez_part_0', 26 | 'headers': ["ein", "object_id", "WbstAddrssTxt", "Orgnztn527Ind", "Orgnztn501c3Ind", "Orgnztn49471NtPFInd", "Orgnztn501cInd", "OfOrgnztnOthrDsc", "OfOrgnztnOthrInd", "OfOrgnztnCrpInd", "OfOrgnztnTrstInd", "OfOrgnztnAsscInd", "GrssRcptsAmt"] 27 | }, 28 | '990ez_part_i': { 29 | 'filename':'990ez_part_i', 30 | 'headers': ["ein", "object_id", "TtlExpnssAmt", "TtlRvnAmt"] 31 | }, 32 | '990pf_part_0': { 33 | 'filename':'990pf_part_0', 34 | 'headers': ["ein", "object_id","PFSttsTrmSct507b1AInd", "Orgnztn501c3TxblPFInd", "Orgnztn501c3ExmptPFInd", "Orgnztn49471TrtdPFInd", "FMVAsstsEOYAmt"] 35 | }, 36 | '990pf_part_i': { 37 | 'filename':'990pf_part_i', 38 | 'headers': ["ein", "object_id", 
'TtlRvAndExpnssAmt', 'CmpOfcrDrTrstRvAndExpnssAmt', 'OthEmplSlrsWgsRvAndExpnssAmt', 'TtOprExpnssRvAndExpnssAmt', 'CntrPdRvAndExpnssAmt', 'TtlExpnssRvAndExpnssAmt'] 39 | }, 40 | '990pf_part_viia': { 41 | 'filename':'990pf_part_viia', 42 | 'headers': ["ein", "object_id", "SttmntsRgrdngActy_WbstAddrssTxt"] 43 | }, 44 | 'employees_990': { 45 | 'filename':'employees_990', # will output to employees_detailedYYYY.csv where year is specified below 46 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"] 47 | }, 48 | 'employees_990PF': { 49 | 'filename':'employees_990PF', # will output to employees_detailedYYYY.csv where year is specified below 50 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"] 51 | }, 52 | 'employees_990EZ': { 53 | 'filename':'employees_990EZ', # will output to employees_detailedYYYY.csv where year is specified below 54 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"] 55 | }, 56 | 'header_metadata': { 57 | 'filename':'header_metadata', # will output to employees_detailedYYYY.csv where year is specified below 58 | 'headers':["ein", "object_id", "BsnssNm_BsnssNmLn1Txt", "BsnssNm_BsnssNmLn2Txt", "USAddrss_AddrssLn1Txt", "USAddrss_AddrssLn2Txt", "USAddrss_CtyNm", "USAddrss_SttAbbrvtnCd", "RtrnHdr_TxPrdBgnDt", "RtrnHdr_TxPrdEndDt", "BsnssOffcr_SgntrDt", "Flr_PhnNm", "RtrnHdr_RtrnTs"] 59 | } 60 | , 61 | '990L_loans': { 62 | 'filename':'990L_loans', # 
will output to employees_detailedYYYY.csv where year is specified below 63 | 'headers':[ 'ein', 'object_id', 'BsnssNmLn1Txt', 'BsnssNmLn2Txt', 'PrsnNm', 'RltnshpWthOrgTxt', 'LnPrpsTxt', 'LnFrmOrgnztnInd', 'LnTOrgnztnInd', 'OrgnlPrncplAmt', 'BlncDAmt', 'DfltInd', 'BrdOrCmmttApprvlInd', 'WrttnAgrmntInd'] 64 | } 65 | , 66 | '990L_grants': { 67 | 'filename':'990L_grants', # will output to employees_detailedYYYY.csv where year is specified below 68 | 'headers':["ein", "object_id", "PrsnNm", "BsnssNmLn1Txt", "BsnssNmLn2Txt", "RltnshpWthOrgTxt", "CshGrntAmt", "OfAssstncTxt", "AssstncPrpsTxt"] 69 | }, 70 | '990L_trans': { 71 | 'filename':'990L_trans', # will output to employees_detailedYYYY.csv where year is specified below 72 | 'headers':["ein", "object_id", "BsnssNmLn1Txt", "PrsnNm", "BsnssNmLn2Txt", "RltnshpDscrptnTxt", "TrnsctnAmt", "TrnsctnDsc", "ShrngOfRvnsInd"] 73 | } 74 | # 'diversions': { 75 | # 'filename':'diversions', # will output to diversionsYYYY.csv where year is specified below 76 | # 'headers':["year", "ein", "object_id", "taxpayer_name", "diversion_ind"] 77 | # } 78 | } 79 | 80 | data_capture_dict = { 81 | 'IRS990': { 82 | 'parts': { 83 | 'part_0': { 84 | 'stream_key': '990_part_0', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 85 | 'ein': {'header':'ein'}, 86 | 'object_id': {'header':'object_id'}, 87 | 'Orgnztn527Ind':{'header':'Orgnztn527Ind'}, 88 | 'Orgnztn501cInd':{'header':'Orgnztn501cInd'}, 89 | 'Orgnztn49471NtPFInd':{'header':'Orgnztn49471NtPFInd'}, 90 | 'Orgnztn501c3Ind' :{'header':'Orgnztn501c3Ind'}, 91 | 'WbstAddrssTxt' :{'header':'WbstAddrssTxt'}, 92 | 'OfOrgnztnTrstInd' :{'header':'OfOrgnztnTrstInd'}, 93 | 'OthrOrgnztnDsc' :{'header':'OthrOrgnztnDsc'}, 94 | 'OfOrgnztnCrpInd' :{'header':'OfOrgnztnCrpInd'}, 95 | 'OfOrgnztnOthrInd' :{'header':'OfOrgnztnOthrInd'}, 96 | 'OfOrgnztnAsscInd' :{'header':'OfOrgnztnAsscInd'}, 97 | 'FrmtnYr' :{'header':'FrmtnYr'}, 98 | 'LglDmclSttCd' :{'header':'LglDmclSttCd'}, 99 | 
'LglDmclCntryCd' :{'header':'LglDmclCntryCd'}, 100 | }, 101 | 'part_i': { 102 | 'stream_key': '990_part_i', 103 | 'ein': {'header':'ein'}, 104 | 'object_id': {'header':'object_id'}, 105 | 'CntrctTrmntnInd': {'header': "CntrctTrmntnInd"}, 106 | 'ActvtyOrMssnDsc': {'header': "ActvtyOrMssnDsc"}, 107 | 'TtlEmplyCnt': {'header': "TtlEmplyCnt"}, 108 | 'TtlVlntrsCnt': {'header': "TtlVlntrsCnt"}, 109 | 'CYInvstmntIncmAmt': {'header': "CYInvstmntIncmAmt"}, 110 | 'CYTtlRvnAmt': {'header': "CYTtlRvnAmt"}, 111 | 'CYTtlExpnssAmt': {'header': "CYTtlExpnssAmt"}, 112 | 'CYRvnsLssExpnssAmt': {'header': "CYRvnsLssExpnssAmt"}, 113 | 'TtlAsstsEOYAmt': {'header': "TtlAsstsEOYAmt"} 114 | }, 115 | 'part_iv': { 116 | "stream_key": '990_part_iv', 117 | "ein": {'header': "ein"}, 118 | "object_id": {'header': "object_id"}, 119 | "PrtlLqdtnInd": {'header': "PrtlLqdtnInd"} 120 | } 121 | }, 122 | ## The remaining logic is for capturing salaries wherever they appear in 123 | ## the 990, 990PF and 990EZ 124 | 'groups': { 125 | 'Frm990PrtVIISctnA': { 126 | 'stream_key': 'employees_990', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 127 | 'ein': {'header':'ein'}, 128 | 'object_id': {'header':'object_id'}, 129 | 'PrsnNm': {'header':'name'}, 130 | 'BsnssNmLn1Txt':{'header':'business_name1'}, 131 | 'BsnssNmLn2Txt':{'header':'business_name2'}, 132 | 'TtlTxt': {'header':'title'}, 133 | 'RprtblCmpFrmOrgAmt': { 134 | 'header':'org_comp', 135 | 'default':0 # set numeric if missing 136 | }, 137 | 'RprtblCmpFrmRltdOrgAmt': { 138 | 'header':'related_comp', 139 | 'default':0 140 | }, 141 | 'OthrCmpnstnAmt':{ 142 | 'header':'other_cmp', 143 | 'default':0 144 | }, 145 | 'IndvdlTrstOrDrctrInd':{'header':'IndvdlTrstOrDrctrInd'}, 146 | 'InstttnlTrstInd':{'header':'InstttnlTrstInd'}, 147 | 'OffcrInd':{'header':'OffcrInd'}, 148 | 'KyEmplyInd':{'header':'KyEmplyInd'}, 149 | 'HghstCmpnstdEmplyInd':{'header':'HghstCmpnstdEmplyInd'}, 150 | 
'FrmrOfcrDrctrTrstInd':{'header':'FrmrOfcrDrctrTrstInd'} 151 | } 152 | } 153 | }, 154 | 'IRS990EZ': { 155 | 'parts': { 156 | 'ez_part_0':{ 157 | 'stream_key': '990ez_part_0', 158 | 'ein': {'header':'ein'}, 159 | 'object_id': {'header':'object_id'}, 160 | "WbstAddrssTxt": {'header':'WbstAddrssTxt'}, 161 | "Orgnztn527Ind": {'header':'Orgnztn527Ind'}, 162 | "Orgnztn501c3Ind": {'header':'Orgnztn501c3Ind'}, 163 | "Orgnztn49471NtPFInd": {'header':'Orgnztn49471NtPFInd'}, 164 | "Orgnztn501cInd": {'header':'Orgnztn501cInd'}, 165 | "OfOrgnztnOthrDsc": {'header':'OfOrgnztnOthrDsc'}, 166 | "OfOrgnztnOthrInd": {'header':'OfOrgnztnOthrInd'}, 167 | "OfOrgnztnCrpInd": {'header':'OfOrgnztnCrpInd'}, 168 | "OfOrgnztnTrstInd": {'header':'OfOrgnztnTrstInd'}, 169 | "OfOrgnztnAsscInd": {'header':'OfOrgnztnAsscInd'}, 170 | "GrssRcptsAmt": {'header':'GrssRcptsAmt'} 171 | }, 172 | 'ez_part_i': { 173 | 'stream_key': '990ez_part_i', 174 | 'ein': {'header':'ein'}, 175 | 'object_id': {'header':'object_id'}, 176 | "TtlExpnssAmt": {'header':'TtlExpnssAmt'}, 177 | "TtlRvnAmt": {'header':'TtlRvnAmt'}, 178 | } 179 | }, 180 | 'groups': { 181 | 'EZOffcrDrctrTrstEmpl': { 182 | 'stream_key': 'employees_990EZ', 183 | 'ein': {'header':'ein'}, 184 | 'object_id': {'header':'object_id'}, 185 | 'PrsnNm': {'header':'name'}, 186 | 'BsnssNmLn1': {'header':'business_name1'}, 187 | 'BsnssNmLn2': {'header':'business_name2'}, 188 | 189 | 190 | 'TtlTxt': {'header':'title'}, 191 | 'CmpnstnAmt': { 192 | 'header':'org_comp', 193 | 'default':0 194 | }, 195 | 'composite': { # other compensation includes benefits and other allowances for EZ, PF filers 196 | 'other_cmp': { 197 | 'EmplyBnftPrgrmAmt': { 198 | 'default':0 199 | }, 200 | 'ExpnsAccntOthrAllwncAmt': { 201 | 'default':0 202 | } 203 | } 204 | } 205 | }, 206 | 'EZCmpnstnHghstPdEmpl': { 207 | 'stream_key': 'employees_990EZ', 208 | 'ein': {'header':'ein'}, 209 | 'object_id': {'header':'object_id'}, 210 | 'PrsnNm': {'header':'name'}, 211 | 'TtlTxt': {'header':'title'}, 
212 | 'CmpnstnAmt': { 213 | 'header':'org_comp', 214 | 'default':0 215 | }, 216 | 'composite': { 217 | 'other_cmp': { 218 | 'EmplyBnftsAmt': { 219 | 'default':0 220 | }, 221 | 'ExpnsAccntAmt': { 222 | 'default':0 223 | } 224 | } 225 | } 226 | } 227 | } 228 | }, 229 | 'ReturnHeader990x': { 230 | 'parts': { 231 | 'returnheader990x_part_i': { 232 | 'stream_key': 'header_metadata', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 233 | 'ein': {'header':'ein'}, 234 | 'object_id': {'header':'object_id'}, 235 | 'RtrnHdr_TxYr':{'header':'RtrnHdr_TxYr'}, 236 | 'BsnssNm_BsnssNmLn2Txt': {'header':'BsnssNm_BsnssNmLn2Txt'}, 237 | 'BsnssNm_BsnssNmLn1Txt': {'header':'BsnssNm_BsnssNmLn1Txt'}, 238 | 'USAddrss_AddrssLn1Txt': {'header':'USAddrss_AddrssLn1Txt'}, 239 | 'USAddrss_AddrssLn2Txt': {'header':'USAddrss_AddrssLn2Txt'}, 240 | 'USAddrss_CtyNm': {'header':'USAddrss_CtyNm'}, 241 | 'USAddrss_SttAbbrvtnCd': {'header':'USAddrss_SttAbbrvtnCd'}, 242 | 'RtrnHdr_TxPrdBgnDt': {'header':'RtrnHdr_TxPrdBgnDt'}, 243 | 'RtrnHdr_TxPrdEndDt': {'header':'RtrnHdr_TxPrdEndDt'}, 244 | 'BsnssOffcr_SgntrDt': {'header': 'BsnssOffcr_SgntrDt'}, 245 | 'Flr_PhnNm': {'header': 'Flr_PhnNm'}, 246 | 'RtrnHdr_RtrnTs': {'header': 'RtrnHdr_RtrnTs'} 247 | } 248 | } 249 | }, 250 | 'IRS990ScheduleL': { 251 | 'parts': { 252 | }, 253 | 'groups': { 254 | 'SkdLLnsBtwnOrgIntrstdPrsn': { 255 | 'stream_key': '990L_loans', 256 | 'ein': {'header':'ein'}, 257 | 'object_id': {'header':'object_id'}, 258 | 'BsnssNmLn1Txt': {'header':'BsnssNmLn1Txt'}, 259 | 'BsnssNmLn2Txt': {'header':'BsnssNmLn2Txt'}, 260 | 'PrsnNm': {'header':'PrsnNm'}, 261 | 'RltnshpWthOrgTxt': {'header':'RltnshpWthOrgTxt'}, 262 | 'LnPrpsTxt': {'header':'LnPrpsTxt'}, 263 | 'LnFrmOrgnztnInd': {'header':'LnFrmOrgnztnInd'}, 264 | 'LnTOrgnztnInd': {'header':'LnTOrgnztnInd'}, 265 | 'OrgnlPrncplAmt': {'header':'OrgnlPrncplAmt'}, 266 | 'BlncDAmt': {'header':'BlncDAmt'}, 267 | 'DfltInd': {'header':'DfltInd'}, 268 | 
'BrdOrCmmttApprvlInd': {'header':'BrdOrCmmttApprvlInd'}, 269 | 'WrttnAgrmntInd': {'header':'WrttnAgrmntInd'} 270 | }, 271 | 'SkdLGrntAsstBnftIntrstdPrsn': { 272 | 'stream_key': '990L_grants', 273 | 'ein': {'header':'ein'}, 274 | 'object_id': {'header':'object_id'}, 275 | "PrsnNm": {'header':'PrsnNm'}, 276 | "BsnssNmLn1Txt": {'header':'BsnssNmLn1Txt'}, 277 | "BsnssNmLn2Txt": {'header':'BsnssNmLn2Txt'}, 278 | "RltnshpWthOrgTxt": {'header':'RltnshpWthOrgTxt'}, 279 | "CshGrntAmt": {'header':'CshGrntAmt'}, 280 | "OfAssstncTxt": {'header':'OfAssstncTxt'}, 281 | "AssstncPrpsTxt": {'header':'AssstncPrpsTxt'}, 282 | }, 283 | 'SkdLBsTrInvlvIntrstdPrsn': { 284 | 'stream_key': '990L_trans', 285 | 'ein': {'header':'ein'}, 286 | 'object_id': {'header':'object_id'}, 287 | "BsnssNmLn1Txt": {'header':'BsnssNmLn1Txt'}, 288 | "PrsnNm": {'header':'PrsnNm'}, 289 | "BsnssNmLn2Txt": {'header':'BsnssNmLn2Txt'}, 290 | "RltnshpDscrptnTxt": {'header':'RltnshpDscrptnTxt'}, 291 | "TrnsctnAmt": {'header':'TrnsctnAmt'}, 292 | "TrnsctnDsc": {'header':'TrnsctnDsc'}, 293 | "ShrngOfRvnsInd": {'header':'ShrngOfRvnsInd'} 294 | } 295 | } 296 | }, 297 | 298 | 'IRS990PF': { 299 | 'parts': { 300 | 'pf_part_0': { 301 | 'stream_key': '990pf_part_0', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 302 | 'ein': {'header':'ein'}, 303 | 'object_id': {'header':'object_id'}, 304 | "PFSttsTrmSct507b1AInd": {'header':'PFSttsTrmSct507b1AInd'}, 305 | "Orgnztn501c3TxblPFInd": {'header':'Orgnztn501c3TxblPFInd'}, 306 | "Orgnztn501c3ExmptPFInd": {'header':'Orgnztn501c3ExmptPFInd'}, 307 | "Orgnztn49471TrtdPFInd": {'header':'Orgnztn49471TrtdPFInd'}, 308 | "FMVAsstsEOYAmt": {'header':'FMVAsstsEOYAmt'}, 309 | }, 310 | 'pf_part_i': { 311 | 'stream_key': '990pf_part_i', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 312 | 'ein': {'header':'ein'}, 313 | 'object_id': {'header':'object_id'}, 314 | 'TtlRvAndExpnssAmt': {'header':'TtlRvAndExpnssAmt'}, 315 
| 'CmpOfcrDrTrstRvAndExpnssAmt': {'header':'CmpOfcrDrTrstRvAndExpnssAmt'}, 316 | 'OthEmplSlrsWgsRvAndExpnssAmt': {'header':'OthEmplSlrsWgsRvAndExpnssAmt'}, 317 | 'TtOprExpnssRvAndExpnssAmt': {'header':'TtOprExpnssRvAndExpnssAmt'}, 318 | 'CntrPdRvAndExpnssAmt': {'header':'CntrPdRvAndExpnssAmt'}, 319 | 'TtlExpnssRvAndExpnssAmt': {'header':'TtlExpnssRvAndExpnssAmt'} 320 | }, 321 | 'pf_part_viia': { 322 | 'stream_key': '990pf_part_viia', # 'stream_key' specifies where the output goes--must exist as a key in output_streams 323 | 'ein': {'header':'ein'}, 324 | 'object_id': {'header':'object_id'}, 325 | 'SttmntsRgrdngActy_WbstAddrssTxt': {'header':'SttmntsRgrdngActy_WbstAddrssTxt'} 326 | } 327 | }, 328 | 'groups': { 329 | 'PFOffcrDrTrstKyEmpl': { 330 | 'stream_key': 'employees_990PF', 331 | 332 | 'ein': {'header':'ein'}, 333 | 'object_id': {'header':'object_id'}, 334 | 'OffcrDrTrstKyEmpl_PrsnNm': {'header':'name'}, 335 | 'OffcrDrTrstKyEmpl_BsnssNmLn1': {'header':'business_name1'}, 336 | 'OffcrDrTrstKyEmpl_BsnssNmLn2': {'header':'business_name2'}, 337 | 'OffcrDrTrstKyEmpl_TtlTxt': {'header':'title'}, 338 | 'OffcrDrTrstKyEmpl_CmpnstnAmt': { 339 | 'header':'org_comp', 340 | 'default':0 # set numeric if missing 341 | }, 342 | 'composite': { 343 | 'other_cmp': { 344 | 'OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt': { 345 | 'default':0 346 | }, 347 | 'OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt': { 348 | 'default':0 349 | } 350 | } 351 | } 352 | }, 353 | 'PFCmpnstnHghstPdEmpl': { 354 | 'stream_key': 'employees_990PF', 355 | 356 | 'ein': {'header':'ein'}, 357 | 'object_id': {'header':'object_id'}, 358 | 'CmpnstnHghstPdEmpl_PrsnNm': {'header':'name'}, 359 | 'CmpnstnHghstPdEmpl_TtlTxt': {'header':'title'}, 360 | 'CmpnstnHghstPdEmpl_CmpnstnAmt': { 361 | 'header':'org_comp', 362 | 'default':0 # set numeric if missing 363 | }, 364 | 'composite': { 365 | 'other_cmp': { 366 | 'CmpnstnHghstPdEmpl_EmplyBnftsAmt': { 367 | 'default':0 368 | }, 369 | 'CmpnstnHghstPdEmpl_ExpnsAccntAmt': { 370 | 
'default':0 371 | } 372 | } 373 | } 374 | } 375 | } 376 | } 377 | } 378 | 379 | if __name__ == '__main__': 380 | 381 | input_file = "initial_manifest.csv" 382 | 383 | 384 | 385 | # read the whole file in here, it's not very long 386 | file_rows = [] 387 | 388 | # We're using the output of part 1 389 | with open(input_file, 'rb') as infile: 390 | reader = csv.DictReader(infile) 391 | for row in reader: 392 | file_rows.append(row) 393 | 394 | 395 | extractor = StreamExtractor(output_streams, data_capture_dict) 396 | 397 | 398 | filing_count = 0 399 | for metadata_row in file_rows: 400 | 401 | try: 402 | object_id = metadata_row['object_id'] 403 | if object_id: 404 | #print("Running %s " % metadata_row['object_id']) 405 | extractor.run_filing(object_id, taxpayer_name=metadata_row['name']) 406 | 407 | filing_count += 1 408 | if filing_count % 100 == 0: 409 | print("Processed %s filings" % filing_count) 410 | 411 | 412 | except FileMissingException: 413 | pass 414 | #print("Missing %s skipping " % metadata_row['object_id']) 415 | 416 | 417 | -------------------------------------------------------------------------------- /irsdb/static/js/bootstrap.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.7 (http://getbootstrap.com) 3 | * Copyright 2011-2016 Twitter, Inc. 
4 | * Licensed under the MIT license 5 | */ 6 | if("undefined"==typeof jQuery)throw new Error("Bootstrap's JavaScript requires jQuery");+function(a){"use strict";var b=a.fn.jquery.split(" ")[0].split(".");if(b[0]<2&&b[1]<9||1==b[0]&&9==b[1]&&b[2]<1||b[0]>3)throw new Error("Bootstrap's JavaScript requires jQuery version 1.9.1 or higher, but lower than version 4")}(jQuery),+function(a){"use strict";function b(){var a=document.createElement("bootstrap"),b={WebkitTransition:"webkitTransitionEnd",MozTransition:"transitionend",OTransition:"oTransitionEnd otransitionend",transition:"transitionend"};for(var c in b)if(void 0!==a.style[c])return{end:b[c]};return!1}a.fn.emulateTransitionEnd=function(b){var c=!1,d=this;a(this).one("bsTransitionEnd",function(){c=!0});var e=function(){c||a(d).trigger(a.support.transition.end)};return setTimeout(e,b),this},a(function(){a.support.transition=b(),a.support.transition&&(a.event.special.bsTransitionEnd={bindType:a.support.transition.end,delegateType:a.support.transition.end,handle:function(b){if(a(b.target).is(this))return b.handleObj.handler.apply(this,arguments)}})})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var c=a(this),e=c.data("bs.alert");e||c.data("bs.alert",e=new d(this)),"string"==typeof b&&e[b].call(c)})}var c='[data-dismiss="alert"]',d=function(b){a(b).on("click",c,this.close)};d.VERSION="3.3.7",d.TRANSITION_DURATION=150,d.prototype.close=function(b){function c(){g.detach().trigger("closed.bs.alert").remove()}var e=a(this),f=e.attr("data-target");f||(f=e.attr("href"),f=f&&f.replace(/.*(?=#[^\s]*$)/,""));var g=a("#"===f?[]:f);b&&b.preventDefault(),g.length||(g=e.closest(".alert")),g.trigger(b=a.Event("close.bs.alert")),b.isDefaultPrevented()||(g.removeClass("in"),a.support.transition&&g.hasClass("fade")?g.one("bsTransitionEnd",c).emulateTransitionEnd(d.TRANSITION_DURATION):c())};var e=a.fn.alert;a.fn.alert=b,a.fn.alert.Constructor=d,a.fn.alert.noConflict=function(){return 
a.fn.alert=e,this},a(document).on("click.bs.alert.data-api",c,d.prototype.close)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.button"),f="object"==typeof b&&b;e||d.data("bs.button",e=new c(this,f)),"toggle"==b?e.toggle():b&&e.setState(b)})}var c=function(b,d){this.$element=a(b),this.options=a.extend({},c.DEFAULTS,d),this.isLoading=!1};c.VERSION="3.3.7",c.DEFAULTS={loadingText:"loading..."},c.prototype.setState=function(b){var c="disabled",d=this.$element,e=d.is("input")?"val":"html",f=d.data();b+="Text",null==f.resetText&&d.data("resetText",d[e]()),setTimeout(a.proxy(function(){d[e](null==f[b]?this.options[b]:f[b]),"loadingText"==b?(this.isLoading=!0,d.addClass(c).attr(c,c).prop(c,!0)):this.isLoading&&(this.isLoading=!1,d.removeClass(c).removeAttr(c).prop(c,!1))},this),0)},c.prototype.toggle=function(){var a=!0,b=this.$element.closest('[data-toggle="buttons"]');if(b.length){var c=this.$element.find("input");"radio"==c.prop("type")?(c.prop("checked")&&(a=!1),b.find(".active").removeClass("active"),this.$element.addClass("active")):"checkbox"==c.prop("type")&&(c.prop("checked")!==this.$element.hasClass("active")&&(a=!1),this.$element.toggleClass("active")),c.prop("checked",this.$element.hasClass("active")),a&&c.trigger("change")}else this.$element.attr("aria-pressed",!this.$element.hasClass("active")),this.$element.toggleClass("active")};var d=a.fn.button;a.fn.button=b,a.fn.button.Constructor=c,a.fn.button.noConflict=function(){return a.fn.button=d,this},a(document).on("click.bs.button.data-api",'[data-toggle^="button"]',function(c){var d=a(c.target).closest(".btn");b.call(d,"toggle"),a(c.target).is('input[type="radio"], input[type="checkbox"]')||(c.preventDefault(),d.is("input,button")?d.trigger("focus"):d.find("input:visible,button:visible").first().trigger("focus"))}).on("focus.bs.button.data-api 
blur.bs.button.data-api",'[data-toggle^="button"]',function(b){a(b.target).closest(".btn").toggleClass("focus",/^focus(in)?$/.test(b.type))})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.carousel"),f=a.extend({},c.DEFAULTS,d.data(),"object"==typeof b&&b),g="string"==typeof b?b:f.slide;e||d.data("bs.carousel",e=new c(this,f)),"number"==typeof b?e.to(b):g?e[g]():f.interval&&e.pause().cycle()})}var c=function(b,c){this.$element=a(b),this.$indicators=this.$element.find(".carousel-indicators"),this.options=c,this.paused=null,this.sliding=null,this.interval=null,this.$active=null,this.$items=null,this.options.keyboard&&this.$element.on("keydown.bs.carousel",a.proxy(this.keydown,this)),"hover"==this.options.pause&&!("ontouchstart"in document.documentElement)&&this.$element.on("mouseenter.bs.carousel",a.proxy(this.pause,this)).on("mouseleave.bs.carousel",a.proxy(this.cycle,this))};c.VERSION="3.3.7",c.TRANSITION_DURATION=600,c.DEFAULTS={interval:5e3,pause:"hover",wrap:!0,keyboard:!0},c.prototype.keydown=function(a){if(!/input|textarea/i.test(a.target.tagName)){switch(a.which){case 37:this.prev();break;case 39:this.next();break;default:return}a.preventDefault()}},c.prototype.cycle=function(b){return b||(this.paused=!1),this.interval&&clearInterval(this.interval),this.options.interval&&!this.paused&&(this.interval=setInterval(a.proxy(this.next,this),this.options.interval)),this},c.prototype.getItemIndex=function(a){return this.$items=a.parent().children(".item"),this.$items.index(a||this.$active)},c.prototype.getItemForDirection=function(a,b){var c=this.getItemIndex(b),d="prev"==a&&0===c||"next"==a&&c==this.$items.length-1;if(d&&!this.options.wrap)return b;var e="prev"==a?-1:1,f=(c+e)%this.$items.length;return this.$items.eq(f)},c.prototype.to=function(a){var b=this,c=this.getItemIndex(this.$active=this.$element.find(".item.active"));if(!(a>this.$items.length-1||a<0))return 
this.sliding?this.$element.one("slid.bs.carousel",function(){b.to(a)}):c==a?this.pause().cycle():this.slide(a>c?"next":"prev",this.$items.eq(a))},c.prototype.pause=function(b){return b||(this.paused=!0),this.$element.find(".next, .prev").length&&a.support.transition&&(this.$element.trigger(a.support.transition.end),this.cycle(!0)),this.interval=clearInterval(this.interval),this},c.prototype.next=function(){if(!this.sliding)return this.slide("next")},c.prototype.prev=function(){if(!this.sliding)return this.slide("prev")},c.prototype.slide=function(b,d){var e=this.$element.find(".item.active"),f=d||this.getItemForDirection(b,e),g=this.interval,h="next"==b?"left":"right",i=this;if(f.hasClass("active"))return this.sliding=!1;var j=f[0],k=a.Event("slide.bs.carousel",{relatedTarget:j,direction:h});if(this.$element.trigger(k),!k.isDefaultPrevented()){if(this.sliding=!0,g&&this.pause(),this.$indicators.length){this.$indicators.find(".active").removeClass("active");var l=a(this.$indicators.children()[this.getItemIndex(f)]);l&&l.addClass("active")}var m=a.Event("slid.bs.carousel",{relatedTarget:j,direction:h});return a.support.transition&&this.$element.hasClass("slide")?(f.addClass(b),f[0].offsetWidth,e.addClass(h),f.addClass(h),e.one("bsTransitionEnd",function(){f.removeClass([b,h].join(" ")).addClass("active"),e.removeClass(["active",h].join(" ")),i.sliding=!1,setTimeout(function(){i.$element.trigger(m)},0)}).emulateTransitionEnd(c.TRANSITION_DURATION)):(e.removeClass("active"),f.addClass("active"),this.sliding=!1,this.$element.trigger(m)),g&&this.cycle(),this}};var d=a.fn.carousel;a.fn.carousel=b,a.fn.carousel.Constructor=c,a.fn.carousel.noConflict=function(){return a.fn.carousel=d,this};var e=function(c){var d,e=a(this),f=a(e.attr("data-target")||(d=e.attr("href"))&&d.replace(/.*(?=#[^\s]+$)/,""));if(f.hasClass("carousel")){var 
g=a.extend({},f.data(),e.data()),h=e.attr("data-slide-to");h&&(g.interval=!1),b.call(f,g),h&&f.data("bs.carousel").to(h),c.preventDefault()}};a(document).on("click.bs.carousel.data-api","[data-slide]",e).on("click.bs.carousel.data-api","[data-slide-to]",e),a(window).on("load",function(){a('[data-ride="carousel"]').each(function(){var c=a(this);b.call(c,c.data())})})}(jQuery),+function(a){"use strict";function b(b){var c,d=b.attr("data-target")||(c=b.attr("href"))&&c.replace(/.*(?=#[^\s]+$)/,"");return a(d)}function c(b){return this.each(function(){var c=a(this),e=c.data("bs.collapse"),f=a.extend({},d.DEFAULTS,c.data(),"object"==typeof b&&b);!e&&f.toggle&&/show|hide/.test(b)&&(f.toggle=!1),e||c.data("bs.collapse",e=new d(this,f)),"string"==typeof b&&e[b]()})}var d=function(b,c){this.$element=a(b),this.options=a.extend({},d.DEFAULTS,c),this.$trigger=a('[data-toggle="collapse"][href="#'+b.id+'"],[data-toggle="collapse"][data-target="#'+b.id+'"]'),this.transitioning=null,this.options.parent?this.$parent=this.getParent():this.addAriaAndCollapsedClass(this.$element,this.$trigger),this.options.toggle&&this.toggle()};d.VERSION="3.3.7",d.TRANSITION_DURATION=350,d.DEFAULTS={toggle:!0},d.prototype.dimension=function(){var a=this.$element.hasClass("width");return a?"width":"height"},d.prototype.show=function(){if(!this.transitioning&&!this.$element.hasClass("in")){var b,e=this.$parent&&this.$parent.children(".panel").children(".in, .collapsing");if(!(e&&e.length&&(b=e.data("bs.collapse"),b&&b.transitioning))){var f=a.Event("show.bs.collapse");if(this.$element.trigger(f),!f.isDefaultPrevented()){e&&e.length&&(c.call(e,"hide"),b||e.data("bs.collapse",null));var g=this.dimension();this.$element.removeClass("collapse").addClass("collapsing")[g](0).attr("aria-expanded",!0),this.$trigger.removeClass("collapsed").attr("aria-expanded",!0),this.transitioning=1;var h=function(){this.$element.removeClass("collapsing").addClass("collapse 
in")[g](""),this.transitioning=0,this.$element.trigger("shown.bs.collapse")};if(!a.support.transition)return h.call(this);var i=a.camelCase(["scroll",g].join("-"));this.$element.one("bsTransitionEnd",a.proxy(h,this)).emulateTransitionEnd(d.TRANSITION_DURATION)[g](this.$element[0][i])}}}},d.prototype.hide=function(){if(!this.transitioning&&this.$element.hasClass("in")){var b=a.Event("hide.bs.collapse");if(this.$element.trigger(b),!b.isDefaultPrevented()){var c=this.dimension();this.$element[c](this.$element[c]())[0].offsetHeight,this.$element.addClass("collapsing").removeClass("collapse in").attr("aria-expanded",!1),this.$trigger.addClass("collapsed").attr("aria-expanded",!1),this.transitioning=1;var e=function(){this.transitioning=0,this.$element.removeClass("collapsing").addClass("collapse").trigger("hidden.bs.collapse")};return a.support.transition?void this.$element[c](0).one("bsTransitionEnd",a.proxy(e,this)).emulateTransitionEnd(d.TRANSITION_DURATION):e.call(this)}}},d.prototype.toggle=function(){this[this.$element.hasClass("in")?"hide":"show"]()},d.prototype.getParent=function(){return a(this.options.parent).find('[data-toggle="collapse"][data-parent="'+this.options.parent+'"]').each(a.proxy(function(c,d){var e=a(d);this.addAriaAndCollapsedClass(b(e),e)},this)).end()},d.prototype.addAriaAndCollapsedClass=function(a,b){var c=a.hasClass("in");a.attr("aria-expanded",c),b.toggleClass("collapsed",!c).attr("aria-expanded",c)};var e=a.fn.collapse;a.fn.collapse=c,a.fn.collapse.Constructor=d,a.fn.collapse.noConflict=function(){return a.fn.collapse=e,this},a(document).on("click.bs.collapse.data-api",'[data-toggle="collapse"]',function(d){var e=a(this);e.attr("data-target")||d.preventDefault();var f=b(e),g=f.data("bs.collapse"),h=g?"toggle":e.data();c.call(f,h)})}(jQuery),+function(a){"use strict";function b(b){var c=b.attr("data-target");c||(c=b.attr("href"),c=c&&/#[A-Za-z]/.test(c)&&c.replace(/.*(?=#[^\s]*$)/,""));var d=c&&a(c);return d&&d.length?d:b.parent()}function 
c(c){c&&3===c.which||(a(e).remove(),a(f).each(function(){var d=a(this),e=b(d),f={relatedTarget:this};e.hasClass("open")&&(c&&"click"==c.type&&/input|textarea/i.test(c.target.tagName)&&a.contains(e[0],c.target)||(e.trigger(c=a.Event("hide.bs.dropdown",f)),c.isDefaultPrevented()||(d.attr("aria-expanded","false"),e.removeClass("open").trigger(a.Event("hidden.bs.dropdown",f)))))}))}function d(b){return this.each(function(){var c=a(this),d=c.data("bs.dropdown");d||c.data("bs.dropdown",d=new g(this)),"string"==typeof b&&d[b].call(c)})}var e=".dropdown-backdrop",f='[data-toggle="dropdown"]',g=function(b){a(b).on("click.bs.dropdown",this.toggle)};g.VERSION="3.3.7",g.prototype.toggle=function(d){var e=a(this);if(!e.is(".disabled, :disabled")){var f=b(e),g=f.hasClass("open");if(c(),!g){"ontouchstart"in document.documentElement&&!f.closest(".navbar-nav").length&&a(document.createElement("div")).addClass("dropdown-backdrop").insertAfter(a(this)).on("click",c);var h={relatedTarget:this};if(f.trigger(d=a.Event("show.bs.dropdown",h)),d.isDefaultPrevented())return;e.trigger("focus").attr("aria-expanded","true"),f.toggleClass("open").trigger(a.Event("shown.bs.dropdown",h))}return!1}},g.prototype.keydown=function(c){if(/(38|40|27|32)/.test(c.which)&&!/input|textarea/i.test(c.target.tagName)){var d=a(this);if(c.preventDefault(),c.stopPropagation(),!d.is(".disabled, :disabled")){var e=b(d),g=e.hasClass("open");if(!g&&27!=c.which||g&&27==c.which)return 27==c.which&&e.find(f).trigger("focus"),d.trigger("click");var h=" li:not(.disabled):visible a",i=e.find(".dropdown-menu"+h);if(i.length){var j=i.index(c.target);38==c.which&&j>0&&j--,40==c.which&&jdocument.documentElement.clientHeight;this.$element.css({paddingLeft:!this.bodyIsOverflowing&&a?this.scrollbarWidth:"",paddingRight:this.bodyIsOverflowing&&!a?this.scrollbarWidth:""})},c.prototype.resetAdjustments=function(){this.$element.css({paddingLeft:"",paddingRight:""})},c.prototype.checkScrollbar=function(){var 
a=window.innerWidth;if(!a){var b=document.documentElement.getBoundingClientRect();a=b.right-Math.abs(b.left)}this.bodyIsOverflowing=document.body.clientWidth
',trigger:"hover focus",title:"",delay:0,html:!1,container:!1,viewport:{selector:"body",padding:0}},c.prototype.init=function(b,c,d){if(this.enabled=!0,this.type=b,this.$element=a(c),this.options=this.getOptions(d),this.$viewport=this.options.viewport&&a(a.isFunction(this.options.viewport)?this.options.viewport.call(this,this.$element):this.options.viewport.selector||this.options.viewport),this.inState={click:!1,hover:!1,focus:!1},this.$element[0]instanceof document.constructor&&!this.options.selector)throw new Error("`selector` option must be specified when initializing "+this.type+" on the window.document object!");for(var e=this.options.trigger.split(" "),f=e.length;f--;){var g=e[f];if("click"==g)this.$element.on("click."+this.type,this.options.selector,a.proxy(this.toggle,this));else if("manual"!=g){var h="hover"==g?"mouseenter":"focusin",i="hover"==g?"mouseleave":"focusout";this.$element.on(h+"."+this.type,this.options.selector,a.proxy(this.enter,this)),this.$element.on(i+"."+this.type,this.options.selector,a.proxy(this.leave,this))}}this.options.selector?this._options=a.extend({},this.options,{trigger:"manual",selector:""}):this.fixTitle()},c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.getOptions=function(b){return b=a.extend({},this.getDefaults(),this.$element.data(),b),b.delay&&"number"==typeof b.delay&&(b.delay={show:b.delay,hide:b.delay}),b},c.prototype.getDelegateOptions=function(){var b={},c=this.getDefaults();return this._options&&a.each(this._options,function(a,d){c[a]!=d&&(b[a]=d)}),b},c.prototype.enter=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof 
a.Event&&(c.inState["focusin"==b.type?"focus":"hover"]=!0),c.tip().hasClass("in")||"in"==c.hoverState?void(c.hoverState="in"):(clearTimeout(c.timeout),c.hoverState="in",c.options.delay&&c.options.delay.show?void(c.timeout=setTimeout(function(){"in"==c.hoverState&&c.show()},c.options.delay.show)):c.show())},c.prototype.isInStateTrue=function(){for(var a in this.inState)if(this.inState[a])return!0;return!1},c.prototype.leave=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);if(c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof a.Event&&(c.inState["focusout"==b.type?"focus":"hover"]=!1),!c.isInStateTrue())return clearTimeout(c.timeout),c.hoverState="out",c.options.delay&&c.options.delay.hide?void(c.timeout=setTimeout(function(){"out"==c.hoverState&&c.hide()},c.options.delay.hide)):c.hide()},c.prototype.show=function(){var b=a.Event("show.bs."+this.type);if(this.hasContent()&&this.enabled){this.$element.trigger(b);var d=a.contains(this.$element[0].ownerDocument.documentElement,this.$element[0]);if(b.isDefaultPrevented()||!d)return;var e=this,f=this.tip(),g=this.getUID(this.type);this.setContent(),f.attr("id",g),this.$element.attr("aria-describedby",g),this.options.animation&&f.addClass("fade");var h="function"==typeof this.options.placement?this.options.placement.call(this,f[0],this.$element[0]):this.options.placement,i=/\s?auto?\s?/i,j=i.test(h);j&&(h=h.replace(i,"")||"top"),f.detach().css({top:0,left:0,display:"block"}).addClass(h).data("bs."+this.type,this),this.options.container?f.appendTo(this.options.container):f.insertAfter(this.$element),this.$element.trigger("inserted.bs."+this.type);var k=this.getPosition(),l=f[0].offsetWidth,m=f[0].offsetHeight;if(j){var n=h,o=this.getPosition(this.$viewport);h="bottom"==h&&k.bottom+m>o.bottom?"top":"top"==h&&k.top-mo.width?"left":"left"==h&&k.left-lg.top+g.height&&(e.top=g.top+g.height-i)}else{var 
j=b.left-f,k=b.left+f+c;jg.right&&(e.left=g.left+g.width-k)}return e},c.prototype.getTitle=function(){var a,b=this.$element,c=this.options;return a=b.attr("data-original-title")||("function"==typeof c.title?c.title.call(b[0]):c.title)},c.prototype.getUID=function(a){do a+=~~(1e6*Math.random());while(document.getElementById(a));return a},c.prototype.tip=function(){if(!this.$tip&&(this.$tip=a(this.options.template),1!=this.$tip.length))throw new Error(this.type+" `template` option must consist of exactly 1 top-level element!");return this.$tip},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".tooltip-arrow")},c.prototype.enable=function(){this.enabled=!0},c.prototype.disable=function(){this.enabled=!1},c.prototype.toggleEnabled=function(){this.enabled=!this.enabled},c.prototype.toggle=function(b){var c=this;b&&(c=a(b.currentTarget).data("bs."+this.type),c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c))),b?(c.inState.click=!c.inState.click,c.isInStateTrue()?c.enter(c):c.leave(c)):c.tip().hasClass("in")?c.leave(c):c.enter(c)},c.prototype.destroy=function(){var a=this;clearTimeout(this.timeout),this.hide(function(){a.$element.off("."+a.type).removeData("bs."+a.type),a.$tip&&a.$tip.detach(),a.$tip=null,a.$arrow=null,a.$viewport=null,a.$element=null})};var d=a.fn.tooltip;a.fn.tooltip=b,a.fn.tooltip.Constructor=c,a.fn.tooltip.noConflict=function(){return a.fn.tooltip=d,this}}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.popover"),f="object"==typeof b&&b;!e&&/destroy|hide/.test(b)||(e||d.data("bs.popover",e=new c(this,f)),"string"==typeof b&&e[b]())})}var c=function(a,b){this.init("popover",a,b)};if(!a.fn.tooltip)throw new Error("Popover requires 
tooltip.js");c.VERSION="3.3.7",c.DEFAULTS=a.extend({},a.fn.tooltip.Constructor.DEFAULTS,{placement:"right",trigger:"click",content:"",template:''}),c.prototype=a.extend({},a.fn.tooltip.Constructor.prototype),c.prototype.constructor=c,c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.setContent=function(){var a=this.tip(),b=this.getTitle(),c=this.getContent();a.find(".popover-title")[this.options.html?"html":"text"](b),a.find(".popover-content").children().detach().end()[this.options.html?"string"==typeof c?"html":"append":"text"](c),a.removeClass("fade top bottom left right in"),a.find(".popover-title").html()||a.find(".popover-title").hide()},c.prototype.hasContent=function(){return this.getTitle()||this.getContent()},c.prototype.getContent=function(){var a=this.$element,b=this.options;return a.attr("data-content")||("function"==typeof b.content?b.content.call(a[0]):b.content)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".arrow")};var d=a.fn.popover;a.fn.popover=b,a.fn.popover.Constructor=c,a.fn.popover.noConflict=function(){return a.fn.popover=d,this}}(jQuery),+function(a){"use strict";function b(c,d){this.$body=a(document.body),this.$scrollElement=a(a(c).is(document.body)?window:c),this.options=a.extend({},b.DEFAULTS,d),this.selector=(this.options.target||"")+" .nav li > a",this.offsets=[],this.targets=[],this.activeTarget=null,this.scrollHeight=0,this.$scrollElement.on("scroll.bs.scrollspy",a.proxy(this.process,this)),this.refresh(),this.process()}function c(c){return this.each(function(){var d=a(this),e=d.data("bs.scrollspy"),f="object"==typeof c&&c;e||d.data("bs.scrollspy",e=new b(this,f)),"string"==typeof c&&e[c]()})}b.VERSION="3.3.7",b.DEFAULTS={offset:10},b.prototype.getScrollHeight=function(){return this.$scrollElement[0].scrollHeight||Math.max(this.$body[0].scrollHeight,document.documentElement.scrollHeight)},b.prototype.refresh=function(){var 
b=this,c="offset",d=0;this.offsets=[],this.targets=[],this.scrollHeight=this.getScrollHeight(),a.isWindow(this.$scrollElement[0])||(c="position",d=this.$scrollElement.scrollTop()),this.$body.find(this.selector).map(function(){var b=a(this),e=b.data("target")||b.attr("href"),f=/^#./.test(e)&&a(e);return f&&f.length&&f.is(":visible")&&[[f[c]().top+d,e]]||null}).sort(function(a,b){return a[0]-b[0]}).each(function(){b.offsets.push(this[0]),b.targets.push(this[1])})},b.prototype.process=function(){var a,b=this.$scrollElement.scrollTop()+this.options.offset,c=this.getScrollHeight(),d=this.options.offset+c-this.$scrollElement.height(),e=this.offsets,f=this.targets,g=this.activeTarget;if(this.scrollHeight!=c&&this.refresh(),b>=d)return g!=(a=f[f.length-1])&&this.activate(a);if(g&&b=e[a]&&(void 0===e[a+1]||b .dropdown-menu > .active").removeClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!1),b.addClass("active").find('[data-toggle="tab"]').attr("aria-expanded",!0),h?(b[0].offsetWidth,b.addClass("in")):b.removeClass("fade"),b.parent(".dropdown-menu").length&&b.closest("li.dropdown").addClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!0),e&&e()}var g=d.find("> .active"),h=e&&a.support.transition&&(g.length&&g.hasClass("fade")||!!d.find("> .fade").length);g.length&&h?g.one("bsTransitionEnd",f).emulateTransitionEnd(c.TRANSITION_DURATION):f(),g.removeClass("in")};var d=a.fn.tab;a.fn.tab=b,a.fn.tab.Constructor=c,a.fn.tab.noConflict=function(){return a.fn.tab=d,this};var e=function(c){c.preventDefault(),b.call(a(this),"show")};a(document).on("click.bs.tab.data-api",'[data-toggle="tab"]',e).on("click.bs.tab.data-api",'[data-toggle="pill"]',e)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.affix"),f="object"==typeof b&&b;e||d.data("bs.affix",e=new c(this,f)),"string"==typeof b&&e[b]()})}var 
c=function(b,d){this.options=a.extend({},c.DEFAULTS,d),this.$target=a(this.options.target).on("scroll.bs.affix.data-api",a.proxy(this.checkPosition,this)).on("click.bs.affix.data-api",a.proxy(this.checkPositionWithEventLoop,this)),this.$element=a(b),this.affixed=null,this.unpin=null,this.pinnedOffset=null,this.checkPosition()};c.VERSION="3.3.7",c.RESET="affix affix-top affix-bottom",c.DEFAULTS={offset:0,target:window},c.prototype.getState=function(a,b,c,d){var e=this.$target.scrollTop(),f=this.$element.offset(),g=this.$target.height();if(null!=c&&"top"==this.affixed)return e=a-d&&"bottom"},c.prototype.getPinnedOffset=function(){if(this.pinnedOffset)return this.pinnedOffset;this.$element.removeClass(c.RESET).addClass("affix");var a=this.$target.scrollTop(),b=this.$element.offset();return this.pinnedOffset=b.top-a},c.prototype.checkPositionWithEventLoop=function(){setTimeout(a.proxy(this.checkPosition,this),1)},c.prototype.checkPosition=function(){if(this.$element.is(":visible")){var b=this.$element.height(),d=this.options.offset,e=d.top,f=d.bottom,g=Math.max(a(document).height(),a(document.body).height());"object"!=typeof d&&(f=e=d),"function"==typeof e&&(e=d.top(this.$element)),"function"==typeof f&&(f=d.bottom(this.$element));var h=this.getState(g,b,e,f);if(this.affixed!=h){null!=this.unpin&&this.$element.css("top","");var i="affix"+(h?"-"+h:""),j=a.Event(i+".bs.affix");if(this.$element.trigger(j),j.isDefaultPrevented())return;this.affixed=h,this.unpin="bottom"==h?this.getPinnedOffset():null,this.$element.removeClass(c.RESET).addClass(i).trigger(i.replace("affix","affixed")+".bs.affix")}"bottom"==h&&this.$element.offset({top:g-b-f})}};var d=a.fn.affix;a.fn.affix=b,a.fn.affix.Constructor=c,a.fn.affix.noConflict=function(){return a.fn.affix=d,this},a(window).on("load",function(){a('[data-spy="affix"]').each(function(){var 
c=a(this),d=c.data();d.offset=d.offset||{},null!=d.offsetBottom&&(d.offset.bottom=d.offsetBottom),null!=d.offsetTop&&(d.offset.top=d.offsetTop),b.call(c,d)})})}(jQuery); --------------------------------------------------------------------------------