There are no non-repeating variables in this part; most likely the contents of this part are in a repeating group above.
35 | {% endif %}
36 | {% endblock %}
37 |
--------------------------------------------------------------------------------
/irsdb/metadata/irsx_utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | # field names for 'variables.csv' file
4 | VARIABLE_FIELDNAMES = [
5 | 'parent_sked', 'parent_sked_part', 'in_a_group', 'db_table', 'ordering',
6 | 'db_name', 'xpath', 'irs_type', 'db_type', 'line_number', 'description',
7 | 'versions'
8 | ]
9 |
10 | GROUP_FIELDNAMES = [
11 | 'parent_sked', 'parent_sked_part', 'ordering', 'xpath', 'db_name',
12 | 'line_number', 'description', 'headless', 'versions'
13 | ]
14 |
15 | SCHEDULE_PART_FIELDNAMES = [
16 | 'parent_sked', 'parent_sked_part', 'ordering',
17 | 'part_name', 'xml_root', 'is_shell'
18 | ]
19 |
20 | def get_writer(outfilename, fieldnames):
21 | """ Returns a writer that writes to the csv 'spec' we use
22 | Keeping files consistent makes file diffs more readable.
23 | """
24 | outfile = open(outfilename, 'w', newline='') # csv docs call for newline='' on python 3 ('wb' on python 2)
25 | writer = csv.DictWriter(
26 | outfile,
27 | fieldnames=fieldnames,
28 | delimiter=',',
29 | quotechar='"',
30 | lineterminator='\n',
31 | quoting=csv.QUOTE_MINIMAL
32 | )
33 | writer.writeheader()
34 | return writer
35 |
36 | def get_variable_writer(outfilename):
37 | return get_writer(outfilename, VARIABLE_FIELDNAMES)
38 |
39 | def get_group_writer(outfilename):
40 | return get_writer(outfilename, GROUP_FIELDNAMES)
41 |
42 | def get_schedule_parts_writer(outfilename):
43 | return get_writer(outfilename, SCHEDULE_PART_FIELDNAMES)
44 |
45 | def clean_value(value):
46 | """ This gets run on every value """
47 | value = value.lstrip(" ") # Remove leading spaces (lstrip(" ") leaves tabs/newlines alone)
48 | if value == 'NA': # Throw out NA's
49 | return ''
50 | return value
51 |
52 | def fix_row(rowdict):
53 | for key in rowdict.keys():
54 | rowdict[key] = clean_value(rowdict[key])
55 | return rowdict
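
A minimal usage sketch of the helpers above (the import path and field values are assumptions, not from the repo):

    from metadata.irsx_utils import get_variable_writer, fix_row, VARIABLE_FIELDNAMES

    writer = get_variable_writer("variables.csv")
    row = dict.fromkeys(VARIABLE_FIELDNAMES, '')
    row.update({'db_table': 'part_i', 'db_name': 'example', 'line_number': ' NA'})
    writer.writerow(fix_row(row))  # leading spaces stripped, 'NA' written as ''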
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # PyCharm
104 | .idea/
105 |
106 | # Migrations
107 | irsdb/filing/migrations/
108 | irsdb/metadata/migrations/
109 | irsdb/return/migrations/
110 |
--------------------------------------------------------------------------------
/irsdb/templates/metadata/variable.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 |
4 | {% block title %}{{ this_variable.db_table }}-{{ this_variable.db_name }}{% endblock %}
5 |
6 | {% block content %}
7 | The listing of form parts and repeating groups below describes the consistent structure and naming convention IRSx uses to represent nonprofit tax filings. Click on a form part or a repeating group named below to see the individual line items, their database tables and names, and the line number and description given to them in the IRS' metadata. For more, see the about page.
22 | DB Table: {{ this_variable.db_table }}
23 | DB Name: {{ this_variable.db_name }}
25 | Line number: {{ this_variable.line_number }}
26 | Description: {{ this_variable.description }}
27 | IRS Type: {{ this_variable.irs_type }}
28 | Repeating: {{ this_variable.in_a_group }}
29 | Years: {{ this_variable.version_start }}-{{ this_variable.version_end }}
30 |
31 | {% if line_numbers %}
32 | Line numbers:
33 | {% for line_number in line_numbers %}
34 | Line number: {{ line_number.line_number }}
35 | Years: {{ line_number.version_start }} - {{ line_number.version_end }}
36 |
37 | {% endfor %}
38 | {% endif %}
39 |
40 | {% if descriptions %}
41 | Descriptions
42 | {% for description in descriptions %}
43 | Description: {{ description.description }}
44 |
45 | {% endfor %}
46 | {% endif %}
47 |
48 | {% endblock %}
49 |
--------------------------------------------------------------------------------
/irsdb/filing/management/commands/make_manifest.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from django.core.management.base import BaseCommand
4 |
5 | from filing.models import Filing
6 |
7 | foundation_manifest = "foundation_manifest.csv"
8 | ein_manifest = "ein_manifest.csv"
9 |
10 | output_file = "initial_manifest.csv"
11 |
12 | headers = ['ein', 'object_id', 'name', 'tax_period', 'form_type', 'is_most_recent', 'missing']
13 |
14 |
15 |
16 | class Command(BaseCommand):
17 | help = '''
18 | Read a source csv of eins, and find all filings.
19 | Record which is the most recent report.
20 | Disregard form 990T.
21 | '''
22 |
23 | def write_ein_details(self, ein):
24 | print("\n\nprocessing ein '%s'" % ein)
25 | filings = Filing.objects.filter(ein=ein).order_by('-tax_period', '-sub_date')
26 | first = 1
27 | if filings:
28 | for filing in filings:
29 | if filing.return_type == '990T':
30 | continue
31 |
32 | this_filing_dict = {
33 | 'name':filing.taxpayer_name,
34 | 'form_type': filing.return_type,
35 | 'ein': filing.ein,
36 | 'tax_period': filing.tax_period,
37 | 'is_most_recent':first,
38 | 'object_id':filing.object_id
39 |
40 | }
41 | print("'%s' - %s - %s - %s - %s - %s" % (filing.taxpayer_name, filing.return_type, filing.sub_date, filing.tax_period, filing.return_id, filing.object_id))
42 | self.dw.writerow(this_filing_dict)
43 | first = 0
44 | else:
45 | this_filing_dict = {
46 | 'ein': ein,
47 | 'missing': 1
48 | }
49 | self.dw.writerow(this_filing_dict)
50 |
51 |
52 | def handle(self, *args, **options):
53 |
54 | outfilehandle = open(output_file, 'w')
55 | self.dw = csv.DictWriter(outfilehandle, headers, extrasaction='ignore')
56 | self.dw.writeheader()
57 |
58 | reader = open(foundation_manifest, 'r')
59 | for row in reader:
60 | self.write_ein_details(row.strip())
61 |
62 | reader = open(ein_manifest, 'r')
63 | for row in reader:
64 | self.write_ein_details(row.strip())
65 |
66 |
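
A hedged invocation sketch; the command takes no arguments and assumes foundation_manifest.csv and ein_manifest.csv (one EIN per line) sit in the working directory:

    python manage.py make_manifest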
--------------------------------------------------------------------------------
/irsdb/return/management/commands/remove_half_loaded.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand
2 | from django.db import connection
3 |
4 | class Command(BaseCommand):
5 | help = '''
6 | remove all filings from a given year that appear to be loading errors; where parse_start is true and
7 | parse_complete = False.
8 | '''
9 |
10 | def add_arguments(self, parser):
11 | # Positional arguments
12 | parser.add_argument('year', nargs=1, type=int)
13 |
14 | def handle(self, *args, **options):
15 | self.cursor = connection.cursor()
16 | self.submission_year = int(options['year'][0])
17 |
18 | BASE_QUERY = "(select object_id from filing_filing where parse_started=True and parse_complete=False and submission_year=%s)" % self.submission_year
19 |
20 | all_tables = connection.introspection.table_names()
21 | for table in all_tables:
22 | if table.startswith('return'):
23 | query = "delete from %s where object_id in %s" % (table, BASE_QUERY)
24 | print("Running query: '%s' " % query)
25 | result = self.cursor.execute(query)
26 | print("Done '%s'\n" % result )
27 |
28 | # reset the status fields on the half-loaded rows; parse_started is
29 | # cleared last, because the other updates filter on parse_started = True
30 | cmds = [
31 | "update filing_filing set process_time=Null where parse_started = True and not process_time is Null and parse_complete=False and submission_year=%s" % self.submission_year,
32 | "update filing_filing set is_error=False where parse_started = True and is_error = True and parse_complete=False and submission_year=%s" % self.submission_year,
33 | "update filing_filing set key_error_count=Null where parse_started = True and not key_error_count is Null and parse_complete=False and submission_year=%s" % self.submission_year,
34 | "update filing_filing set error_details =Null where parse_started = True and not error_details is Null and parse_complete=False and submission_year=%s" % self.submission_year,
35 | "update filing_filing set parse_started=False where parse_started = True and parse_complete=False and submission_year=%s" % self.submission_year]
35 |
36 | for cmd in cmds:
37 | print("Running query: '%s' " % cmd)
38 | self.cursor.execute(cmd)
39 |
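
Example invocation (the positional argument is the submission year recorded on the filings):

    python manage.py remove_half_loaded 2018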
--------------------------------------------------------------------------------
/setup_supporting_tables.sh:
--------------------------------------------------------------------------------
1 | -- Write out reference charts: address and org_types
2 |
3 | DROP TABLE if exists address_table;
4 |
5 | SELECT
6 | return_returnheader990x_part_i.ein,
7 | return_returnheader990x_part_i.object_id,
8 | return_returnheader990x_part_i."RtrnHdr_TxPrdEndDt",
9 | return_returnheader990x_part_i."RtrnHdr_TxYr",
10 | return_returnheader990x_part_i."BsnssNm_BsnssNmLn1Txt",
11 | return_returnheader990x_part_i."BsnssNm_BsnssNmLn2Txt",
12 | return_returnheader990x_part_i."BsnssOffcr_PrsnNm",
13 | return_returnheader990x_part_i."BsnssOffcr_PrsnTtlTxt",
14 | return_returnheader990x_part_i."BsnssOffcr_PhnNm",
15 | return_returnheader990x_part_i."BsnssOffcr_EmlAddrssTxt",
16 | return_returnheader990x_part_i."BsnssOffcr_SgntrDt",
17 | return_returnheader990x_part_i."USAddrss_AddrssLn1Txt",
18 | return_returnheader990x_part_i."USAddrss_AddrssLn2Txt",
19 | return_returnheader990x_part_i."USAddrss_CtyNm",
20 | return_returnheader990x_part_i."USAddrss_SttAbbrvtnCd",
21 | return_returnheader990x_part_i."USAddrss_ZIPCd",
22 | return_returnheader990x_part_i."FrgnAddrss_AddrssLn1Txt",
23 | return_returnheader990x_part_i."FrgnAddrss_AddrssLn2Txt",
24 | return_returnheader990x_part_i."FrgnAddrss_CtyNm",
25 | return_returnheader990x_part_i."FrgnAddrss_PrvncOrSttNm",
26 | return_returnheader990x_part_i."FrgnAddrss_CntryCd"
27 | INTO address_table
28 | FROM return_returnheader990x_part_i;
29 |
30 |
31 | DROP INDEX IF EXISTS xx_990_address_oid_ein;
32 | CREATE INDEX xx_990_address_oid_ein ON address_table (object_id, ein);
33 |
34 |
35 | drop table if exists org_types;
36 |
37 | select distinct "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", ein, object_id, concat(ein, '/', object_id) as url_base into org_types from return_part_0;
38 |
39 | insert into org_types select distinct "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", ein, object_id, concat(ein, '/', object_id) as url_base from return_ez_part_0;
40 |
41 | insert into org_types("Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", ein, object_id, url_base) select distinct "Orgnztn501c3ExmptPFInd" as "Orgnztn501c3Ind", "Orgnztn501c3TxblPFInd" as "Orgnztn501cInd", "Orgnztn49471TrtdPFInd" as "Orgnztn49471NtPFInd", ein, object_id, concat(ein, '/', object_id) as url_base from return_pf_part_0;
42 |
43 | DROP INDEX IF EXISTS xx_990_entity_oid_ein;
44 | CREATE INDEX xx_990_entity_oid_ein ON org_types (object_id, ein);
45 |
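
Despite the .sh extension this is SQL; the \copy meta-command in the companion directors.sh suggests these scripts are run through psql. A hedged sketch, with the database name as a placeholder:

    psql irsdb -f setup_supporting_tables.sh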
--------------------------------------------------------------------------------
/directors.sh:
--------------------------------------------------------------------------------
1 |
2 |
3 | -- 990 Employees
4 |
5 |
6 | DROP TABLE IF EXISTS tmp_990_employees;
7 | SELECT address_table.*,
8 | '/IRS990' as form,
9 | "PrsnNm",
10 | "TtlTxt",
11 | "RprtblCmpFrmOrgAmt" as "CmpnstnAmt"
12 | INTO temporary table tmp_990_employees
13 | FROM return_Frm990PrtVIISctnA
14 | LEFT JOIN address_table ON return_Frm990PrtVIISctnA.ein = address_table.ein
15 | AND return_Frm990PrtVIISctnA.object_id=address_table.object_id;
16 |
17 | DROP TABLE IF EXISTS tmp_990_employees_types;
18 |
19 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990_employees.* into temporary table tmp_990_employees_types from tmp_990_employees left join org_types on tmp_990_employees.object_id = org_types.object_id and tmp_990_employees.ein = org_types.ein;
20 | \copy tmp_990_employees_types to '/data/file_exports/990_employees.csv' with csv header;
21 |
22 |
23 | -- EZ
24 |
25 |
26 |
27 | DROP TABLE IF EXISTS tmp_990ez_employees;
28 | SELECT address_table.*,
29 | '/IRS990EZ' as form,
30 | "PrsnNm",
31 | "TtlTxt",
32 | "CmpnstnAmt"
33 | INTO temporary table tmp_990EZ_employees
34 | FROM return_EZOffcrDrctrTrstEmpl
35 | LEFT JOIN address_table ON return_EZOffcrDrctrTrstEmpl.ein = address_table.ein
36 | AND return_EZOffcrDrctrTrstEmpl.object_id= address_table.object_id;
37 |
38 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990EZ_employees.* into temporary table tmp_990EZ_employees_types from tmp_990EZ_employees left join org_types on tmp_990EZ_employees.object_id = org_types.object_id and tmp_990EZ_employees.ein = org_types.ein;
39 | \copy tmp_990EZ_employees_types to '/data/file_exports/990EZ_employees.csv' with csv header;
40 |
41 |
42 | -- PF
43 |
44 |
45 | DROP TABLE IF EXISTS tmp_990PF_employees;
46 | SELECT address_table.*,
47 | '/IRS990PF' as form,
48 | "OffcrDrTrstKyEmpl_PrsnNm" AS "PrsnNm",
49 | "OffcrDrTrstKyEmpl_TtlTxt" AS "TtlTxt",
50 | "OffcrDrTrstKyEmpl_CmpnstnAmt" AS "CmpnstnAmt"
51 | INTO temporary table tmp_990PF_employees
52 | FROM return_PFOffcrDrTrstKyEmpl
53 | LEFT JOIN address_table ON return_PFOffcrDrTrstKyEmpl.ein = address_table.ein
54 | AND return_PFOffcrDrTrstKyEmpl.object_id= address_table.object_id;
55 |
56 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, tmp_990PF_employees.* into temporary table tmp_990PF_employees_types from tmp_990PF_employees left join org_types on tmp_990PF_employees.object_id = org_types.object_id and tmp_990PF_employees.ein = org_types.ein;
57 | \copy tmp_990PF_employees_types to '/data/file_exports/990PF_employees.csv' with csv header;
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/irsdb/metadata/management/commands/run_bake.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from django.core.management.base import BaseCommand
4 | from metadata.models import *
5 |
6 | from django.conf import settings
7 |
8 | import requests
9 |
10 | METADATA_DIRECTORY = settings.METADATA_DIRECTORY
11 | REPORT_COUNT = 100
12 |
13 |
14 | FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
15 |
16 |
17 | class Command(BaseCommand):
18 | help = """
19 | Bake the site out to files.
20 | """
21 |
22 | def hit_url(self, url):
23 | print("Baking out url %s" % url)
24 | requests.get(url)
25 |
26 | def run_parts(self):
27 | all_parts = SchedulePart.objects.all()
28 | for part in all_parts:
29 | url = "http://localhost:8000/metadata/parts/" + part.parent_sked_part + ".html"
30 | self.hit_url(url)
31 |
32 | def run_groups(self):
33 | all_groups = Group.objects.all()
34 | for group in all_groups:
35 | url = "http://localhost:8000/metadata/groups/" + group.db_name + ".html"
36 | self.hit_url(url)
37 |
38 | def run_variables(self):
39 | #re_path(r'variable/(?P<db_table>[\w\d\_]+)\-(?P<db_name>[\w\d]+).html$', views.show_variable),
40 | all_variables = Variable.objects.all()
41 | for var in all_variables:
42 | var_url = "http://localhost:8000/metadata/variable/" + var.db_table + "-" + var.db_name + ".html"
43 | self.hit_url(var_url)
44 | xpath_url = "http://localhost:8000/metadata/xpath/"+ var.xpath.replace("/","-") + ".html"
45 | self.hit_url(xpath_url)
46 |
47 |
48 | def run_xpaths(self):
49 | all_xpaths = Variable.objects.all()
50 | for xpath in all_xpaths:
51 | print(xpath)
52 | #url = "http://localhost:8000/metadata/variable/" + var.db_table + "-" + var.db_name + ".html"
53 | #self.hit_url(url)
54 |
55 | def run_nav(self):
56 | self.hit_url("http://localhost:8000/metadata/about.html")
57 | self.hit_url("http://localhost:8000/metadata/forms.html")
58 |
59 | def create_dirs(self):
60 | for subdir in ["parts", "groups", "variable", "xpath"]:
61 | try:
62 | os.makedirs(os.path.join( FILE_SYSTEM_BASE, "metadata", subdir))
63 | except FileExistsError:
64 | print("File %s exists skipping" % subdir)
65 |
66 | def handle(self, *args, **options):
67 | print("Baking out urls")
68 | self.create_dirs()
69 | self.run_nav()
70 | self.run_parts()
71 | self.run_groups()
72 | self.run_variables()
73 |
74 | """
75 | re_path(r'xpath/(?P<xpath>.+).html', views.show_xpath),
76 | re_path(r'variable/(?P<db_table>[\w\d\_]+)\-(?P<db_name>[\w\d]+).html$', views.show_variable),
77 | """
78 |
79 |
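
How the baked responses reach disk is implied by create_dirs and FILE_SYSTEM_BASE rather than shown here; a hedged run sketch, assuming the dev server is already serving the site:

    python manage.py runserver 8000   # in one shell
    python manage.py run_bake         # in another: GETs each metadata url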
--------------------------------------------------------------------------------
/irsdb/templates/base.html:
--------------------------------------------------------------------------------
1 | {% load static %}
2 |
12 | {% block title %}{% endblock %}
51 | {% block content %}
52 | {% endblock %}
--------------------------------------------------------------------------------
/irsdb/schemas/management/commands/find_empty_heads.py:
--------------------------------------------------------------------------------
1 | import os, csv
2 |
3 | from django.core.management.base import BaseCommand
4 | from django.apps import apps
5 |
6 | from django.conf import settings
7 |
8 | from irsx.settings import METADATA_DIRECTORY
9 |
10 | class Command(BaseCommand):
11 | help = """ Find 'empty heads' with no values at all.
12 | """
13 |
14 | def get_var_hash(self):
15 | variablefile = os.path.join(METADATA_DIRECTORY, 'variables.csv')
16 | variables = []
17 | with open(variablefile, 'r') as variablefh:
18 | reader = csv.DictReader(variablefh)
19 | for row in reader:
20 | key = row['db_table'] + "_" + row['db_name']
21 | variables.append({'key':key, 'xpath':row['xpath'], 'row':row})
22 | #print("\t - %s" % key)
23 | self.variables = variables
24 |
25 | def find_children(self, key):
26 | results = []
27 | for var in self.variables:
28 | if var['xpath'].startswith(key):
29 | results.append({'name': var['key'], 'xpath': var['xpath'], 'row':var['row']})
30 | return results
31 |
32 | def find_match(self, key):
33 | for var in self.variables:
34 | if var['xpath'] == key:
35 | return var
36 | return None
37 |
38 | def find_empty_heads(self):
39 | count = 0
40 | for var in self.variables:
41 | key = var['xpath'] + "/"
42 | #print("Finding var %s" % key)
43 | children = self.find_children(key)
44 | if len(children) > 2:
45 | print("\n\nHandling xpath=%s" % var['xpath'])
46 | print("db_table %s; db_name:%s" % (var['row']['db_table'],var['row']['db_name'] ))
47 | print("Num children: %s" % (len(children)))
48 |
49 |
50 | print("select count(*) from return_%s where not \"%s\" is null;" % (var['row']['db_table'],var['row']['db_name']))
51 | this_model = apps.get_model(app_label='return', model_name=var['row']['db_table'])
52 | if this_model:
53 | #print ("Got model %s name %s" % (this_model, var['row']['db_table'] ))
54 | pass
55 | else:
56 | print("model missing %s" % var['row']['db_table'] )
57 | assert False
58 |
59 | # now see how many elements there are.
60 | fieldname = var['row']['db_name']
61 | notnullcount = this_model.objects.filter(**{fieldname+'__isnull': False}).count()
62 | print("Count of this field is %s" % notnullcount)
63 | if notnullcount == 0:
64 | count += 1
65 | self.writer.writerow([var['xpath']])
66 |
67 | print("Total suspected empty heads: %s" % count)
68 |
69 |
70 | def handle(self, *args, **options):
71 | outfile = open("emptyheads.csv", "w")
72 |
73 | self.writer = csv.writer(outfile)
74 | self.variables = None
75 | self.get_var_hash()
76 | self.find_empty_heads()
77 |
--------------------------------------------------------------------------------
/irsdb/filing/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 | from django.conf import settings
3 | import os
4 | import re
5 |
6 | from irsx import settings as irsx_settings
7 | XML_DIR = irsx_settings.WORKING_DIRECTORY
8 |
9 | VERSION_RE=re.compile(r'returnVersion="(20\d\dv\d\.\d)"')
10 |
11 | class Filing(models.Model):
12 |
13 | # This is set from the index file.
14 | submission_year = models.IntegerField(blank=False, null=False, default=0, help_text="Index file year")
15 |
16 | # Verbatim fields set from the csv file
17 | return_id = models.CharField(max_length=8, blank=False, null=False, default="", help_text="Return ID")
18 | filing_type = models.CharField(max_length=5, blank=False, null=False, default="", help_text="Always EFILE")
19 | ein = models.CharField(max_length=9, blank=False, null=False, default="", help_text="Employer ID number")
20 | tax_period = models.IntegerField(blank=False, null=False, default=0, help_text="Month filed, YYYYMM")
21 | sub_date = models.CharField(max_length=22, blank=False, null=False, default="", help_text="Submitted date in "
22 | "YYYY-MM-DD format. But submitted to whom?")
23 | taxpayer_name = models.CharField(max_length=100, blank=False, null=False, default="", help_text="Organization name")
24 | return_type = models.CharField(max_length=5, blank=False, null=False, default="", help_text="Return type")
25 | dln = models.CharField(max_length=14, blank=False, null=False, default="", help_text="Document Locator Number")
26 | object_id = models.CharField(max_length=18, blank=False, null=False, default="", help_text="IRS-assigned unique ID")
27 |
28 | # fields we set after processing
29 | schema_version = models.TextField(null=True, help_text="schema version as it appears, e.g. 2015v2.1 ")
30 | tax_year = models.IntegerField(blank=True, null=True, help_text="The year of the tax period, set this from "
31 | "tax_period")
32 |
33 | # Processing notes
34 | parse_started = models.NullBooleanField(help_text="Set this true when parsing begins")
35 | parse_complete = models.NullBooleanField(null=True, help_text="Set true when data stored")
36 | process_time = models.DateTimeField(null=True, help_text="When was parsing complete?")
37 | is_error = models.NullBooleanField(help_text="Was an error of any type encountered during parsing")
38 | key_error_count = models.IntegerField(blank=True, null=True, help_text="Number of key errors found")
39 | error_details = models.TextField(null=True, help_text="Describe error condition")
40 |
41 | def get_aws_URL(self):
42 | return "https://s3.amazonaws.com/irs-form-990/%s_public.xml" % self.object_id
43 |
44 | def get_local_URL(self):
45 | return os.path.join(XML_DIR, "%s_public.xml" % self.object_id)
46 |
47 | def set_schema_version(self):
48 | """
49 | Sets the schema version by trying to read top of file locally.
50 | Efficient b/c it doesn't parse xml, just runs regex on files second line.
51 | Doesn't set if file is missing.
52 | """
53 | filepath = self.get_local_URL()
54 | try:
55 | infile = open(filepath, "r")
56 | except FileNotFoundError:
57 | print("File %s is missing, quitting" % filepath)
58 | return False
59 | top = infile.read(1024)
60 | infile.close()
61 | returnline = top.split("\n")[1]
62 | result = VERSION_RE.search(returnline)
63 | if result:
64 | if result.group(1) != self.schema_version:
65 | self.schema_version = result.group(1)
66 | self.save()
67 | else:
68 | print("No result in object_id: %s returnline:%s" % (self.object_id, returnline))
69 |
70 |
71 | class Meta:
72 | managed = True
73 | indexes = [
74 | models.Index(fields=['object_id']),
75 | ]
76 |
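
A hedged backfill sketch using the model above (the queryset filter is illustrative):

    from filing.models import Filing

    # stamp schema_version on filings that don't have one yet;
    # set_schema_version quietly returns False when the local XML is missing
    for filing in Filing.objects.filter(schema_version__isnull=True):
        filing.set_schema_version()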
--------------------------------------------------------------------------------
/irsdb/filing/management/commands/enter_yearly_submissions.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import requests
4 |
5 | from django.core.management.base import BaseCommand
6 | from filing.models import Filing
7 | from django.conf import settings
8 | from irsx.settings import INDEX_DIRECTORY
9 | from irsx.file_utils import stream_download
10 |
11 | BATCH_SIZE = 10000
12 |
13 |
14 | class Command(BaseCommand):
15 | help = '''
16 | Read the yearly csv file line by line and add new lines if
17 | they don't exist. Lines are added in bulk at the end.
18 | '''
19 |
20 | def add_arguments(self, parser):
21 | # Positional arguments
22 | parser.add_argument('year', nargs='+', type=int)
23 |
24 | def handle(self, *args, **options):
25 | for year in options['year']:
26 | local_file_path = os.path.join(INDEX_DIRECTORY, "index_%s.csv" % year)
27 |
28 |
29 | print("Entering xml submissions from %s" % local_file_path)
30 | fh = open(local_file_path, 'r')
31 | reader = csv.reader(fh)
32 | rows_to_enter = []
33 |
34 | # skip the header row
35 | # (the python 2 idiom was headers = reader.next(), but this is a
36 | # django 2 project, so python 2.x isn't a concern)
37 | next(reader)
38 | count = 0
39 | for line in reader:
40 | try:
41 | # sometimes there's an empty extra column, ignore it
42 | # RETURN_ID,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID
43 | #(return_id, ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id) = line[0:8]
44 |
45 | ## for newer style index forms 2020 and on, perhaps
46 | (return_id, filing_type,ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id) = line[0:9]
47 | # RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID
48 | #print(return_id, ein, tax_period, sub_date, taxpayer_name, return_type, dln, object_id)
49 |
50 | ### select tax_period, parse_complete, count(*) from filing_filing where parse_started is null group by 1,2 order by 1,2;
51 |
52 | ### delete from filing_filing where parse_complete is null and tax_period like '2020%';
53 |
54 | ### select tax_period, parse_complete, count(*) from filing_filing where parse_complete is null and tax_period like '2020%' group by 1,2 order by 1,2;
55 | except ValueError as err:
56 | print("Error with line: {line}".format(line=line))
57 | if year == 2014:
58 | print('Did you fix the 2014 index file? See the README for instructions.')
59 | raise
60 |
61 | try:
62 | obj = Filing.objects.get(object_id=object_id)
63 | except Filing.DoesNotExist:
64 | new_sub = Filing(
65 | return_id=return_id,
66 | filing_type=filing_type,
67 | submission_year=year,
68 | ein=ein,
69 | tax_period=tax_period,
70 | sub_date=sub_date,
71 | taxpayer_name=taxpayer_name,
72 | return_type=return_type,
73 | dln=dln,
74 | object_id=object_id
75 | )
76 | rows_to_enter.append(new_sub)
77 | count += 1
78 |
79 | if count % BATCH_SIZE == 0 and count > 0:
80 | print("Committing %s total entered=%s" % (BATCH_SIZE, count))
81 | Filing.objects.bulk_create(rows_to_enter)
82 | print("commit complete")
83 | rows_to_enter = []
84 |
85 | Filing.objects.bulk_create(rows_to_enter)
86 | print("Added %s new entries." % count)
87 |
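
Since the year argument is declared with nargs='+', several years can be entered in one run:

    python manage.py enter_yearly_submissions 2019 2020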
--------------------------------------------------------------------------------
/irsdb/metadata/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Base for import of metadata csv files
4 | class IRSxBase(models.Model):
5 | parent_sked = models.CharField(max_length=63, blank=True, null=True, help_text="Schedule name", editable=False)
6 | parent_sked_part = models.CharField(max_length=63, blank=True, null=True, help_text="db compliant name; NA for ScheduleParts")
7 | ordering = models.FloatField(null=True, blank=True, help_text="sort order of parts")
8 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False)
9 |
10 | class Meta:
11 | abstract = True
12 |
13 | class Variable(IRSxBase):
14 | in_a_group = models.BooleanField(help_text="is this variable in a group", default=False)
15 | db_table = models.CharField(max_length=63, blank=True, null=True, help_text="db table", editable=False)
16 | db_name = models.CharField(max_length=63, blank=True, null=True, help_text="db name", editable=False)
17 | irs_type = models.CharField(max_length=63, blank=True, null=True, help_text="IRS Type", editable=False)
18 | db_type = models.CharField(max_length=63, blank=True, null=True, help_text="db type", editable=False)
19 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS line number. Missing in returnheader", editable=False)
20 | description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
21 | version_start = models.TextField(help_text="Start year", null=True)
22 | version_end = models.TextField(help_text="End year", null=True)
23 | is_canonical = models.NullBooleanField(help_text="", default=False)
24 | canonical_version = models.CharField(max_length=16, blank=True, null=True, help_text="canonical_version", editable=False)
25 |
26 | def get_absolute_url(self):
27 | return ("/metadata/variable/%s-%s.html" % (self.db_table, self.db_name))
28 |
29 | class Group(IRSxBase):
30 | db_name = models.CharField(max_length=63, blank=True, null=True, help_text="DB name", editable=False)
31 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
32 | description = models.TextField(help_text="IRS-supplied description, from .xsd. ")
33 | headless = models.NullBooleanField(help_text="", default=False)
34 | version_start = models.TextField(help_text="Start year", null=True)
35 | version_end = models.TextField(help_text="End year", null=True)
36 |
37 | def get_absolute_url(self):
38 | return ("/metadata/groups/%s.html" % self.db_name)
39 |
40 | class SchedulePart(IRSxBase):
41 | part_name = models.CharField(max_length=255, blank=True, null=True, help_text="Part Name.", editable=False)
42 | xml_root = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
43 | is_shell = models.NullBooleanField(help_text="", default=False)
44 |
45 | def get_absolute_url(self):
46 | return ("/metadata/parts/%s.html" % self.parent_sked_part)
47 |
48 |
49 | # For historic reference to precise line_numbers, descriptions
50 |
51 | class LineNumber(models.Model):
52 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
53 | version_start = models.TextField(help_text="Start year", null=True)
54 | version_end = models.TextField(help_text="End year", null=True)
55 | line_number = models.CharField(max_length=255, blank=True, null=True, help_text="IRS-supplied line numbers. Missing for returnheaders", editable=False)
56 |
57 | class Description(models.Model):
58 | xpath = models.CharField(max_length=255, blank=True, null=True, help_text="xpath", editable=False) #is this not equivalent to xpath?
59 | version_start = models.TextField(help_text="Start year", null=True)
60 | version_end = models.TextField(help_text="End year", null=True)
61 | description = models.TextField(help_text="description")
62 |
--------------------------------------------------------------------------------
/irsdb/schemas/model_accumulator.py:
--------------------------------------------------------------------------------
1 | from django.apps import apps
2 | from django.forms import model_to_dict
3 |
4 | # Setting too big will create memory problems
5 | BATCH_SIZE = 100
6 | VERBOSE = False
7 |
8 | # TODO: allow appname to be passed as an argument.
9 | APPNAME = 'return'
10 |
11 | listtype = type([])
12 |
13 | class Accumulator(object):
14 |
15 | def __init__(self):
16 | self.model_dict = {}
17 | self.model_cache = {}
18 | # Expected:
19 | # self.model_dict{model_name: [modeldictionary1, modeldictionary2,]...}
20 |
21 | def _clean_restricted(self, row_dict):
22 | """ RESTRICTED appears only on sked b, but SSN's appear in a variety
23 | of places; we could do a better job of restricting this.
24 | """
25 | for key in row_dict.keys():
26 | if type(row_dict[key]) == listtype:
27 | print("\n\n***list found %s" % key)
28 |
29 |
30 | # IRS will replace anything they think is an SSN with "XXX-XX-XXXX";
31 | # this seems to include any 9 digit number.
32 | # The result is that the IRS can lengthen fields (breaking max_length),
33 | # so substitute a formulation that's shorter than the original.
34 | if row_dict[key]:
35 | row_dict[key] = row_dict[key].replace('XXX-XX-XXXX', '-SSN-')
36 |
37 | if row_dict[key] == 'RESTRICTED':
38 | # These are numeric fields, don't try to save 'RESTRICTED'
39 | row_dict[key] = 0
40 |
41 |
42 | def _get_model(self, model_name, appname='return'):
43 | # cache locally so django doesn't try to hit the db every time
44 | try:
45 | return self.model_cache[appname + model_name]
46 | except KeyError:
47 | self.model_cache[appname + model_name] = apps.get_model(appname, model_name)
48 | return self.model_cache[appname + model_name]
49 |
50 | def commit_by_key(self, model_name):
51 | if self.model_dict[model_name]:
52 | this_model = self._get_model(model_name)
53 | if (VERBOSE):
54 | print("Committing %s objects for key %s" % (
55 | len(self.model_dict[model_name]),
56 | model_name
57 | )
58 | )
59 | this_model.objects.bulk_create(self.model_dict[model_name])
60 |
61 | # set array to empty
62 | self.model_dict[model_name] = []
63 |
64 | def add_model(self, model_name, model_dict):
65 | # An artifact upstream is creating empty rows, with no name and only an ein and object_id
66 | # This is probably related to the 'empty head' rows in variables.csv, which will be excised by loading this db and analyzing
67 | if not model_name:
68 | print("###Model name is missing in object_id %s\ndict=%s!!!" % (model_dict['object_id'], model_dict))
69 | return
70 | this_model = self._get_model(model_name)
71 | self._clean_restricted(model_dict)
72 | model_instance = this_model(**model_dict)
73 | try:
74 | self.model_dict[model_name].append(model_instance)
75 |
76 | except KeyError:
77 | self.model_dict[model_name]= [model_instance]
78 |
79 | if len(self.model_dict[model_name]) >= BATCH_SIZE:
80 | self.commit_by_key(model_name)
81 |
82 | def commit_all(self):
83 | # commit everything
84 | if (VERBOSE):
85 | print("Running commit all! ")
86 | self.object_report()  # prints its own report
87 | for thiskey in self.model_dict.keys():
88 | if (VERBOSE):
89 | print("Commit key %s" % thiskey)
90 | self.commit_by_key(thiskey)
91 |
92 | def count(self, model_name):
93 | return len(self.model_dict[model_name])
94 |
95 | def object_report(self):
96 | total = 0
97 | for i in self.model_dict.keys():
98 | thislen = self.count(i)
99 | total += thislen
100 | if thislen > 0:
101 | print("\t%s:%s" % (i, thislen))
102 | print("Total %s objects" % total)
103 |
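
A hedged usage sketch of the Accumulator (the model name and field values are placeholders):

    from schemas.model_accumulator import Accumulator

    acc = Accumulator()
    # rows queue per model; a model's batch is flushed once it hits BATCH_SIZE
    acc.add_model('part_0', {'ein': '000000000', 'object_id': '201800000000000000'})
    acc.commit_all()  # flush every partially filled batch at the end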
--------------------------------------------------------------------------------
/irsdb/irsdb/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for irsdb project.
3 |
4 | Generated by 'django-admin startproject' using Django 2.0.1.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.0/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/2.0/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = '-9eh@&_nb_nvklo6fhck5&bts#+*@hubea7+!65kwr(r4%9$8='
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = True
27 |
28 | ALLOWED_HOSTS = []
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = [
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | 'metadata',
41 | 'filing',
42 | 'return',
43 | 'schemas',
44 |
45 | ]
46 |
47 | MIDDLEWARE = [
48 | 'django.middleware.security.SecurityMiddleware',
49 | 'django.contrib.sessions.middleware.SessionMiddleware',
50 | 'django.middleware.common.CommonMiddleware',
51 | 'django.middleware.csrf.CsrfViewMiddleware',
52 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
53 | 'django.contrib.messages.middleware.MessageMiddleware',
54 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
55 | ]
56 |
57 | ROOT_URLCONF = 'irsdb.urls'
58 |
59 | # TEMPLATES = [
60 | # {
61 | # 'BACKEND': 'django.template.backends.django.DjangoTemplates',
62 | # 'DIRS': [],
63 | # 'APP_DIRS': False,
64 | # 'OPTIONS': {
65 | # 'context_processors': [
66 | # 'django.template.context_processors.debug',
67 | # 'django.template.context_processors.request',
68 | # 'django.contrib.auth.context_processors.auth',
69 | # 'django.contrib.messages.context_processors.messages',
70 | # ],
71 | # },
72 | # },
73 | # ]
74 |
75 | WSGI_APPLICATION = 'irsdb.wsgi.application'
76 |
77 |
78 | # Database
79 | # https://docs.djangoproject.com/en/2.0/ref/settings/#databases
80 |
81 | # DATABASES = {
82 | # 'default': {
83 | # 'ENGINE': 'django.db.backends.sqlite3',
84 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
85 | # }
86 | # }
87 |
88 |
89 | # Password validation
90 | # https://docs.djangoproject.com/en/2.0/ref/settings/#auth-password-validators
91 |
92 | AUTH_PASSWORD_VALIDATORS = [
93 | {
94 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
95 | },
96 | {
97 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
98 | },
99 | {
100 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
101 | },
102 | {
103 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
104 | },
105 | ]
106 |
107 |
108 | # Internationalization
109 | # https://docs.djangoproject.com/en/2.0/topics/i18n/
110 |
111 | LANGUAGE_CODE = 'en-us'
112 |
113 | TIME_ZONE = 'UTC'
114 |
115 | USE_I18N = True
116 |
117 | USE_L10N = True
118 |
119 | USE_TZ = True
120 |
121 |
122 | # Static files (CSS, JavaScript, Images)
123 | # https://docs.djangoproject.com/en/2.0/howto/static-files/
124 |
125 | STATIC_URL = '/static/'
126 |
127 | STATICFILES_DIRS = [
128 | os.path.join(BASE_DIR, "static")
129 | ]
130 |
131 | # will break without irsx
132 | from irsx.settings import METADATA_DIRECTORY, KNOWN_SCHEDULES
133 |
134 | GENERATED_MODELS_DIR = os.path.join(BASE_DIR, "generated_schemas")
135 |
136 | ## suppress verbose complaints by uncommenting
137 | #import warnings
138 | #warnings.simplefilter(action='ignore', category=RuntimeWarning)
139 |
140 |
141 |
142 | try:
143 | from .local_settings import *
144 | except ImportError as e:
145 | print("Error importing local_settings.py\n%s" % e)
146 |
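
The import above expects a local_settings.py next to this file; a minimal hypothetical sketch (the Postgres engine is an assumption based on the psql scripts elsewhere in the repo, and every value is a placeholder):

    # irsdb/irsdb/local_settings.py -- hypothetical example
    DATABASES = {
        'default': {
            'ENGINE': 'django.db.backends.postgresql',
            'NAME': 'irsdb',
            'USER': 'irsdb',
            'PASSWORD': '',
            'HOST': 'localhost',
            'PORT': '5432',
        }
    }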
--------------------------------------------------------------------------------
/irsdb/metadata/management/commands/load_metadata.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | from django.core.management.base import BaseCommand
4 | from metadata.models import *
5 |
6 | from django.conf import settings
7 |
8 | #METADATA_DIRECTORY = settings.METADATA_DIRECTORY
9 | METADATA_DIRECTORY = settings.GENERATED_MODELS_DIR
10 | REPORT_COUNT = 100
11 | CANONICAL_VERSION = '2016v3.0'
12 |
13 | class Command(BaseCommand):
14 | help = """
15 | Erase and reload the metadata tables, one at a time.
16 | """
17 |
18 | def read_blacklist(self):
19 | # read the list of blacklisted xpaths into the self.blacklist hash
20 | infilepath = os.path.join(METADATA_DIRECTORY, "emptyhead_blacklist.txt")
21 | infile = open(infilepath, 'r')
22 | for row in infile:
23 | xpath = row.replace("\n", "")
24 | print("blacklisting xpath '%s' " % xpath)
25 | self.blacklist[xpath] = True
26 |
27 | def reload_variables(self, *args, **options):
28 | print("Deleting variables.")
29 | Variable.objects.all().delete()
30 | infilepath = os.path.join(METADATA_DIRECTORY, "variables.csv")
31 | infile = open(infilepath, 'r')
32 | reader = csv.DictReader(infile)
33 | for i, row in enumerate(reader):
34 |
35 | try:
36 | self.blacklist[row['xpath']]
37 | # it's blacklisted, so ignore
38 | print("ignoring blacklisted xpath %s" % row['xpath'])
39 | except KeyError:
40 | # not blacklisted, so create it
41 |
42 | Variable.objects.create(**row)
43 | print("Total Variables %s" % i)
44 |
45 | def reload_groups(self, *args, **options):
46 | print("Deleting Groups.")
47 | Group.objects.all().delete()
48 | infilepath = os.path.join(METADATA_DIRECTORY, "groups.csv")
49 | infile = open(infilepath, 'r')
50 | reader = csv.DictReader(infile)
51 | for i, row in enumerate(reader):
52 | try:
53 | if row['headless'] == '':
54 | row['headless'] = None
55 | except KeyError:
56 | pass
57 | if i%REPORT_COUNT == 0:
58 | print("Created %s rows" % i)
59 | Group.objects.create(**row)
60 | print("Total Groups %s" % i)
61 |
62 | def reload_schedule_parts(self, *args, **options):
63 | print("Deleting ScheduleParts.")
64 | SchedulePart.objects.all().delete()
65 | infilepath = os.path.join(METADATA_DIRECTORY, "schedule_parts.csv")
66 | infile = open(infilepath, 'r')
67 | reader = csv.DictReader(infile)
68 | for i, row in enumerate(reader):
69 | try:
70 | if row['is_shell'] == '':
71 | row['is_shell'] = None
72 | except KeyError:
73 | pass
74 | if i%REPORT_COUNT == 0:
75 | print("Created %s rows" % i)
76 | SchedulePart.objects.create(**row)
77 | print("Total Schedule Parts %s" % i)
78 |
79 | def reload_line_numbers(self, *args, **options):
80 | print("Deleting LineNumbers.")
81 | LineNumber.objects.all().delete()
82 | infilepath = os.path.join(METADATA_DIRECTORY, "line_numbers.csv")
83 | infile = open(infilepath, 'r')
84 | reader = csv.DictReader(infile)
85 | for i, row in enumerate(reader):
86 | if i%REPORT_COUNT == 0:
87 | print("Created %s rows" % i)
88 |
89 | try:
90 | self.blacklist[row['xpath']]
91 | # it's blacklisted, so ignore
92 | print("ignoring blacklisted xpath line number... %s" % row['xpath'])
93 | except KeyError:
94 | # not blacklisted, so create it
95 | LineNumber.objects.create(**row)
96 | print("Total LineNumber created %s" % i)
97 |
98 | def reload_descriptions(self, *args, **options):
99 | print("Deleting Descriptions.")
100 | Description.objects.all().delete()
101 | infilepath = os.path.join(METADATA_DIRECTORY, "descriptions.csv")
102 | infile = open(infilepath, 'r')
103 | reader = csv.DictReader(infile)
104 | for i, row in enumerate(reader):
105 | if i%REPORT_COUNT == 0:
106 | print("Created %s rows" % i)
107 |
108 | try:
109 | self.blacklist[row['xpath']]
110 | # it's blacklisted, so ignore
111 | print("ignoring blacklisted xpath description... %s" % row['xpath'])
112 | except KeyError:
113 | # not blacklisted, so create it
114 | Description.objects.create(**row)
115 | print("Total Description created %s" % i)
116 |
117 | def handle(self, *args, **options):
118 | print("Running metadata load on variables.")
119 | self.blacklist = {}
120 |
121 | self.read_blacklist()
122 | self.reload_variables()
123 | self.reload_groups()
124 | self.reload_schedule_parts()
125 | self.reload_line_numbers()
126 | self.reload_descriptions()
--------------------------------------------------------------------------------
/irsdb/return/management/commands/load_filings.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import requests
4 |
5 | from datetime import datetime
6 |
7 | from django.core.management.base import BaseCommand
8 | from django.conf import settings
9 |
10 | from filing.models import Filing
11 | from schemas.model_accumulator import Accumulator
12 | from irsx.settings import INDEX_DIRECTORY
13 | from irsx.file_utils import stream_download
14 | from irsx.xmlrunner import XMLRunner
15 | from irsx.filing import FileMissingException
16 |
17 |
18 | # this is how many we process; there's a separate batch size
19 | # in model accumulator for how many are processed
20 | BATCH_SIZE = 1000
21 |
22 |
23 | class Command(BaseCommand):
24 | help = '''
25 | Enter the filings, one by one.
26 | Loading is done in bulk, though status on the filings is updated one at a time.
27 |
28 | '''
29 |
30 | def add_arguments(self, parser):
31 | # Positional arguments
32 | parser.add_argument('year', nargs=1, type=int)
33 |
34 | def setup(self):
35 | # get an XMLRunner -- this is what actually does the parsing
36 | self.xml_runner = XMLRunner()
37 | self.accumulator = Accumulator()
38 |
39 |
40 | def process_sked(self, sked):
41 | """ Enter just one schedule """
42 | #print("Processing schedule %s" % sked['schedule_name'])
43 | for part in sked['schedule_parts'].keys():
44 | partname = part
45 | partdata = sked['schedule_parts'][part]
46 | #print("part %s %s" % (partname, partdata))
47 |
48 | self.accumulator.add_model(partname, partdata)
49 |
50 | for groupname in sked['groups'].keys():
51 | for groupdata in sked['groups'][groupname]:
52 | #print("group %s %s" % (groupname, groupdata) )
53 | self.accumulator.add_model(groupname, groupdata)
54 |
55 |
56 | def run_filing(self, filing):
57 | object_id = filing.object_id
58 |
59 |
60 | parsed_filing = self.xml_runner.run_filing(object_id)
61 |
62 | if not parsed_filing:
63 | print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
64 | return None
65 |
66 | schedule_list = parsed_filing.list_schedules()
67 | #print("sked list is %s" % schedule_list)
68 |
69 | result = parsed_filing.get_result()
70 |
71 | keyerrors = parsed_filing.get_keyerrors()
72 | schema_version = parsed_filing.get_version()
73 | ## This could be disabled if we don't care about the schema version
74 | ## This is one save per loaded row...
75 | if filing.schema_version != schema_version:
76 | filing.schema_version = schema_version
77 | filing.save()
78 |
79 | if keyerrors:
80 | # If we find keyerrors--xpaths that are missing from our spec, note it
81 | print("Key error %s")
82 | has_keyerrors = len(keyerrors) > 0
83 | print("keyerror: %s" % keyerrors)
84 | filing.error_details = str(keyerrors)
85 | filing.key_error_count = len(keyerrors)
86 | filing.is_error = has_keyerrors
87 | filing.save()
88 |
89 | if result:
90 | for sked in result:
91 | self.process_sked(sked)
92 | else:
93 | print("Filing not parsed %s " % object_id)
94 |
95 |
96 | def handle(self, *args, **options):
97 |
98 | year = int(options['year'][0])
99 | if year not in [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]:
100 | raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2025" % year)
101 |
102 | print("Running filings during year %s" % year)
103 | self.setup()
104 |
105 | process_count = 0
106 | missing_filings = 0
107 | missed_file_list = []
108 |
109 | while True:
110 | filings=Filing.objects.filter(submission_year=year).exclude(parse_complete=True)[:100]
111 | if not filings:
112 | print("Done")
113 | break
114 |
115 | object_id_list = [f.object_id for f in filings]
116 |
117 | # record that processing has begun
118 | Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)
119 |
120 |
121 | for filing in filings:
122 | #print("Handling id %s" % filing.object_id)
123 | try:
124 | self.run_filing(filing)
125 | except FileMissingException:
126 | print("File missing %s, skipping" % filing.object_id)
127 | missing_filings += 1
128 | missed_file_list.append(filing.object_id)
129 | process_count += 1
130 | if process_count % 1000 == 0:
131 | print("Handled %s filings" % process_count)
132 |
133 | # commit anything that's left
134 | self.accumulator.commit_all()
135 | # record that all are complete
136 | Filing.objects.filter(object_id__in=object_id_list).update(process_time=datetime.now(), parse_complete=True)
137 | print("Processed a total of %s filings" % process_count)
138 | print("Total missing files: %s" % missing_filings)
139 | print("Missing %s" % missed_file_list)
140 |
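
A hedged invocation sketch; this assumes the year's index rows were already entered (enter_yearly_submissions) and that the XML files are available locally, since missing files are skipped:

    python manage.py load_filings 2019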
--------------------------------------------------------------------------------
/irsdb/schemas/type_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Hand curated list of types that occur in IRS filings, and their representations
3 | in django and sqlalchemy. The latter is pretty rough.
4 |
5 | Defaults to a text field when definitions are missing.
6 |
7 | In general the IRS adapts type definitions to shoehorn in old data,
8 | so using current definitions is usually good enough for older stuff... to a point.
9 |
10 | This list is tied to 2013 forwards, maybe this should be namespaced or linked somehow.
11 |
12 | USAmountType allows 15 digit ints, so should probably be mapped to biginteger or dealt with.
13 | """
14 |
15 | # A char field longer than MAX_CHAR_FIELD_SIZE will be a text field.
16 | # Best setting may be db dependent?
17 | MAX_CHAR_FIELD_SIZE = 200
18 |
19 | # based on 2015/2016 schemas, unclear if pre 2013 stuff will break this.
20 | var_types = {
21 | 'USAmountType':{'type':'Integer', 'length':15},
22 | 'BooleanType':{'type':'Char', 'length':5}, # http://www.datypic.com/sc/xsd/t-xsd_boolean.html, legal values = [0,1,'true', 'false']
23 | 'USAmountNNType':{'type':'Integer', 'length':15},
24 | 'CheckboxType':{'type':'Char', 'length':1}, # legal values = ['X']
25 | 'StreetAddressType':{'type':'Char', 'length':35},
26 | 'LineExplanationType':{'type':'Char', 'length':100},
27 | 'IntegerNNType':{'type':'Integer', 'length':15}, # max value is unclear, see xsd:Integer vs xsd:int
28 | 'BusinessNameLine2Type':{'type':'Char', 'length':75},
29 | 'BusinessNameLine1Type':{'type':'Char', 'length':75},
30 | 'RatioType':{'type':'Decimal', 'totalDigits':6, 'fractionDigits':5},
31 | 'StateType':{'type':'Char', 'length':2}, # need list key to translate back ?
32 | 'CountryType':{'type':'Char', 'length':2},
33 | 'ShortExplanationType':{'type':'Text', 'length':1000},
34 | 'CityType':{'type':'Char', 'length':22},
35 | 'ZIPCodeType':{'type':'Char', 'length':15}, # "ZIP Code - 5 digits plus optional 4 or 7 digits"
36 | 'PersonNameType':{'type':'Char', 'length':35},
37 | 'ExplanationType':{'type':'Text', 'length':9000},
38 | 'YearType':{'type':'Integer', 'length':4},
39 | 'EINType':{'type':'Char', 'length':9},
40 | 'DateType':{'type':'Char', 'length':31}, # unclear http://www.datypic.com/sc/xsd/s-datatypes.xsd.html
41 | 'ShortDescriptionType':{'type':'Char', 'length':20},
42 | 'CountType':{'type':'Integer', 'length':7}, # max length is 6
43 | 'Count2Type':{'type':'Integer', 'length':7}, # max length is 6
44 | 'PhoneNumberType':{'type':'Char', 'length':10},
45 | 'IRS990PFPartVDistriRatioType':{'type':'Decimal', 'totalDigits':12, 'fractionDigits':6}, # was 9, manual fix
46 | 'LargeRatioType':{'type':'Decimal', 'totalDigits':22, 'fractionDigits':12},
47 | 'DecimalNNType':{'type':'Decimal', 'totalDigits':22, 'fractionDigits':2}, # dunno upper bound
48 | 'EFINType':{'type':'Char', 'length':6}, # Type for Electronic Filing Identification No. - 6 digits
49 | 'PINType':{'type':'Char', 'length':5}, # Type for Practitioner PIN, Self-Select PIN and Third Party Designee PIN
50 | 'EmailAddressType':{'type':'Char', 'length':75},
51 | 'SoftwareVersionType':{'type':'Char', 'length':20},
52 | 'TimeType':{'type':'Char', 'length':15}, # Should be no more than 9 chars, but... [0-9]{2}:[0-9]{2}:[0-9]{2}
53 | 'CUSIPNumberType':{'type':'Char', 'length':9},
54 | 'SSNType':{'type':'Char', 'length':12}, # should be 9 but needs to fit "XXX-XX-XXXX" which has 11
55 | 'DeviceIdType':{'type':'Char', 'length':40},
56 | 'BusinessNameControlType':{'type':'Char', 'length':7}, # max is 4: ([A-Z0-9\-]|&){1,4}
57 | 'PersonTitleType':{'type':'Char', 'length':35},
58 | 'OriginatorType':{'type':'Char', 'length':15},
59 | 'TimestampType':{'type':'Char', 'length':63}, # not sure
60 | 'ISPType':{'type':'Char', 'length':6}, # Type for Intermediate Service Provider No. - 6 uppercase alphanumeric characters
61 | 'PTINType':{'type':'Char', 'length':9}, # Type for Preparer Personal Identification No. - P followed by 8 digits
62 | 'USAmountNegType':{'type':'Integer', 'length':15},
63 | 'IPv6Type':{'type':'Char', 'length':31},
64 | 'SoftwareIdType':{'type':'Char', 'length':8}, #The Software ID - 8 digits
65 | 'IPv4Type':{'type':'Char', 'length':31},
66 | 'InCareOfNameType':{'type':'Char', 'length':35},
67 | 'TimezoneType':{'type':'Char', 'length':31}
68 | }
69 |
70 | def get_django_type(vartype):
71 |
72 | try:
73 | thisvar = var_types[vartype]
74 | except KeyError:
75 | return "TextField(null=True, blank=True)"
76 |
77 | if (thisvar['type'] =='Integer'):
78 | if thisvar['length'] < 10:
79 | return "IntegerField(null=True, blank=True)" # 32 bit: Values from -2147483648 to 2147483647
80 | else:
81 | return "BigIntegerField(null=True, blank=True)" # 64 bit: from: -9223372036854775808 to 9223372036854775807
82 |
83 | elif (thisvar['type']=='Decimal'):
84 | return "DecimalField(null=True, blank=True, max_digits=%s, decimal_places=%s)" % (thisvar['totalDigits'], thisvar['fractionDigits'])
85 |
86 | elif (thisvar['type']=='Char'):
87 | if thisvar['length'] <= MAX_CHAR_FIELD_SIZE:
88 | return "CharField(null=True, blank=True, max_length=%s)" % thisvar['length']
89 | else:
90 | return "TextField(null=True, blank=True)"
91 |
92 | elif (thisvar['type']=='Text'):
93 | return "TextField(null=True, blank=True)"
94 |
95 | else:
96 | print("** No match for %s " % thisvar)
97 | return "TextField(null=True, blank=True)"
98 |
99 | def get_sqlalchemy_type(vartype):
100 |
101 | """ This is really rough, not tested, may change """
102 |
103 | try:
104 | thisvar = var_types[vartype]
105 | except KeyError:
106 | return "Text"
107 |
108 | if (thisvar['type'] =='Integer'):
109 | if thisvar['length'] < 10: # 64 bit should be forced to BigInteger
110 | return "Integer"
111 | else:
112 | return "BigInteger"
113 |
114 | elif (thisvar['type']=='Decimal'):
115 | return "Numeric(precision=%s)" % (thisvar['totalDigits'])
116 |
117 | elif (thisvar['type']=='Char'):
118 | if thisvar['length'] <= MAX_CHAR_FIELD_SIZE:
119 | return "String(length=%s)" % thisvar['length']
120 | else:
121 | return "Text"
122 |
123 | elif (thisvar['type']=='Text'):
124 | return "Text"
125 |
126 | else:
127 | print("** No match for %s " % thisvar)
128 | return "Text"
129 |
130 |
131 |
132 |
133 |
134 |
135 | if __name__ == "__main__":
136 | for key in var_types.keys():
137 | print("key %s resolve to '%s' and '%s'" % (key, get_django_type(key), get_sqlalchemy_type(key) ) )
138 |
139 |
--------------------------------------------------------------------------------
/irsdb/stream_extractor.py:
--------------------------------------------------------------------------------
1 | import unicodecsv as csv
2 | from irsx.xmlrunner import XMLRunner
3 |
4 |
5 | class StreamExtractor(object):
6 | """Write filings to csv, specified in config.py"""
7 |
8 | def __init__(self, output_streams, data_capture_dict):
9 | self.output_streams = output_streams
10 | self.data_capture_dict = data_capture_dict
11 | self.xml_runner = XMLRunner()
12 | self._init_streams()
13 |
14 |
15 | def _init_streams(self):
16 | for stream_key in self.output_streams.keys():
17 | this_stream = self.output_streams[stream_key]
18 | filename = this_stream['filename'] + ".csv"
19 | print("Initializing output stream %s" % filename)
20 |             outfile = open(filename, 'wb')
21 | dw = csv.DictWriter(outfile, this_stream['headers'], extrasaction='ignore')
22 | dw.writeheader()
23 | self.output_streams[stream_key]['writer'] = dw
24 |
25 |
26 |
27 | def run_parts(self, this_capture_sked, parsed_sked, sked, taxpayer_name=""):
28 | #print("run parts %s \n %s " % (this_capture_sked, parsed_sked) )
29 | for part_key in this_capture_sked['parts'].keys():
30 | stream_key = this_capture_sked['parts'][part_key]['stream_key']
31 | this_stream = self.output_streams[stream_key]
32 | part = None
33 | try:
34 | part = parsed_sked['schedule_parts'][part_key]
35 | except KeyError:
36 | continue
37 |
38 | capture_dict = this_capture_sked['parts'][part_key]
39 |
40 | row_data = {}
41 | row_data['form'] = sked
42 | row_data['source'] = part_key
43 | row_data['taxpayer_name'] = taxpayer_name
44 |
45 |
46 | for capture_key in capture_dict.keys():
47 | if capture_key == 'stream_key':
48 | continue
49 | try:
50 | val = part[capture_key]
51 | csv_header = capture_dict[capture_key]['header']
52 | row_data[csv_header] = val
53 |
54 | except KeyError:
55 | try:
56 | default = capture_dict[capture_key]['default']
57 | csv_header = capture_dict[capture_key]['header']
58 | row_data[csv_header]=default
59 | except KeyError:
60 | #print("Key Error %s" % capture_key)
61 | pass
62 |
63 | ## Composite keys: Not implemented here.
64 |
65 | #print("row data is %s" % row_data)
66 |                 ## We've gone through the whole part -- write it to file
67 | this_stream['writer'].writerow(row_data)
68 |
69 |
70 |
71 | def run_groups(self, this_capture_sked, parsed_sked, sked, taxpayer_name=""):
72 | for group_key in this_capture_sked['groups'].keys():
73 | stream_key = this_capture_sked['groups'][group_key]['stream_key']
74 | this_stream = self.output_streams[stream_key]
75 | groups = None
76 | try:
77 | groups = parsed_sked['groups'][group_key]
78 | except KeyError:
79 | #print("No groups found for %s\n" % group_key)
80 | continue
81 |
82 | for group in groups:
83 | capture_dict = this_capture_sked['groups'][group_key]
84 | row_data = {}
85 | row_data['form'] = sked
86 | row_data['source'] = group_key
87 | row_data['taxpayer_name'] = taxpayer_name
88 |
89 | for capture_key in capture_dict.keys():
90 | if capture_key == 'stream_key':
91 | continue
92 | try:
93 | val = group[capture_key]
94 | csv_header = capture_dict[capture_key]['header']
95 | row_data[csv_header] = val
96 |
97 | except KeyError:
98 | try:
99 | default = capture_dict[capture_key]['default']
100 | csv_header = capture_dict[capture_key]['header']
101 | row_data[csv_header]=default
102 | except KeyError:
103 | pass
104 |
105 | ## now look for "composite keys"
106 | composite_groups = None
107 | try:
108 | composite_groups = capture_dict['composite']
109 | except KeyError:
110 | pass
111 |
112 | # composite groups are summed up from existing vars, and need a default
113 | if composite_groups:
114 | for composite_group_key in composite_groups.keys():
115 | total = 0
116 | for cg_part in composite_groups[composite_group_key].keys():
117 | try:
118 | val = group[cg_part]
119 | total += int(val)
120 | except KeyError:
121 | total += composite_groups[composite_group_key][cg_part]['default']
122 | row_data[composite_group_key] = total
123 |
124 |                 ## We've gone through the whole group -- write it to file
125 | this_stream['writer'].writerow(row_data)
126 |
127 | def run_filing(self, filing, taxpayer_name=""):
128 |
129 | parsed_filing = self.xml_runner.run_filing(filing)
130 | schedule_list = parsed_filing.list_schedules()
131 |
132 | if ( int(parsed_filing.get_version()[:4]) < 2013 ):
133 | print("Skipping pre-2013 schemas")
134 | return None
135 |
136 | for sked in self.data_capture_dict.keys():
137 | if sked in schedule_list:
138 | #print ("Running sked %s" % sked)
139 | parsed_skeds = parsed_filing.get_parsed_sked(sked)
140 | if parsed_skeds:
141 | parsed_sked = parsed_skeds[0]
142 | else:
143 | continue
144 |
145 | this_capture_sked = self.data_capture_dict[sked]
146 |
147 |
148 | ### Repeating Groups
149 | skip_groups = False
150 | try:
151 | this_capture_sked['groups']
152 | except KeyError:
153 | skip_groups = True
154 | if not skip_groups:
155 | self.run_groups(this_capture_sked, parsed_sked, sked, taxpayer_name=taxpayer_name)
156 |
157 |
158 | ### Nonrepeating schedule parts
159 | skip_parts = False
160 | try:
161 | this_capture_sked['parts']
162 | except KeyError:
163 | skip_parts = True
164 | if not skip_parts:
165 | self.run_parts(this_capture_sked, parsed_sked, sked, taxpayer_name=taxpayer_name)
166 | else:
167 | #print("missing sked %s" % sked)
168 | pass
169 |
170 |
171 |
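For context, here's a minimal sketch of what the config.py structures consumed above might look like. The stream and capture names are illustrative, not the project's actual config, but the shape matches what __init__, run_parts, and run_groups read:

    # Hypothetical config.py contents (key names are illustrative)
    OUTPUT_STREAMS = {
        'grants': {
            'filename': 'grants',  # ".csv" is appended in _init_streams
            'headers': ['form', 'source', 'taxpayer_name', 'recipient', 'amount'],
        },
    }

    DATA_CAPTURE_DICT = {
        'IRS990ScheduleI': {
            'groups': {
                'SkdIRcpntTbl': {
                    'stream_key': 'grants',  # routes this group's rows to the stream above
                    'RcpntBsnssNm_BsnssNmLn1Txt': {'header': 'recipient'},
                    'RcpntTbl_CshGrntAmt': {'header': 'amount', 'default': 0},
                },
            },
        },
    }

    extractor = StreamExtractor(OUTPUT_STREAMS, DATA_CAPTURE_DICT)
    # the object id below is illustrative
    extractor.run_filing('201533089349301428', taxpayer_name='EXAMPLE ORG')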
--------------------------------------------------------------------------------
/grants.sh:
--------------------------------------------------------------------------------
1 | -- Assumes that the directors script was already run.
2 |
3 | -- Schedule I
4 |
5 | -- The schedule I variables are defined in the [irsx documentation](http://www.irsx.info/metadata/groups/SkdIRcpntTbl.html).
6 |
7 | -- Here's a query that selects the results into a temp table
8 |
9 |
10 | DROP TABLE IF EXISTS grants;
11 |
12 | SELECT
13 | return_SkdIRcpntTbl.object_id as object_id,
14 | address_table."RtrnHdr_TxPrdEndDt",
15 | address_table."RtrnHdr_TxYr",
16 | address_table."BsnssOffcr_SgntrDt",
17 | address_table."BsnssNm_BsnssNmLn1Txt" as "Donor_BsnssNmLn1",
18 | address_table."BsnssNm_BsnssNmLn2Txt" as "Donor_BsnssNmLn2",
19 | address_table."BsnssOffcr_PrsnNm" as "Donor_BsnssOffcr_PrsnNm",
20 | address_table."BsnssOffcr_PrsnTtlTxt" as "Donor_BsnssOffcr_PrsnTtlTxt",
21 | address_table."BsnssOffcr_PhnNm" as "Donor_BsnssOffcr_PhnNm" ,
22 | address_table."BsnssOffcr_EmlAddrssTxt" as "Donor_BsnssOffcr_EmlAddrssTxt" ,
23 | address_table."USAddrss_AddrssLn1Txt" as "Donor_AddrssLn1Txt",
24 | address_table."USAddrss_AddrssLn2Txt" as "Donor_AddrssLn2Txt",
25 | address_table."USAddrss_CtyNm" as "Donor_CtyNm",
26 | address_table."USAddrss_SttAbbrvtnCd" as "Donor_SttAbbrvtnCd",
27 | address_table."USAddrss_ZIPCd" as "Donor_ZIPCd",
28 | address_table."FrgnAddrss_AddrssLn1Txt" as "Donor_FrgnAddrss_AddrssLn1Txt",
29 | address_table."FrgnAddrss_AddrssLn2Txt" as "Donor_FrgnAddrss_AddrssLn2Txt",
30 | address_table."FrgnAddrss_CtyNm" as "Donor_FrgnAddrss_CtyNm",
31 | address_table."FrgnAddrss_PrvncOrSttNm" as "Donor_PrvncOrSttNm",
32 | address_table."FrgnAddrss_CntryCd" as "Donor_CntryCd",
33 | return_SkdIRcpntTbl.ein as "Donor_EIN",
34 | '' as "RcpntPrsnNm",
35 | return_SkdIRcpntTbl."RcpntTbl_RcpntEIN" as "Rcpnt_EIN",
36 | return_SkdIRcpntTbl."RcpntBsnssNm_BsnssNmLn1Txt" as "Rcpnt_BsnssNmLn1",
37 | return_SkdIRcpntTbl."RcpntBsnssNm_BsnssNmLn2Txt" as "Rcpnt_BsnssNmLn2",
38 | trim(concat(return_SkdIRcpntTbl."USAddrss_AddrssLn1Txt", ' ', return_SkdIRcpntTbl."FrgnAddrss_AddrssLn1Txt")) as "Rcpnt_AddrssLn1",
39 | trim(concat(return_SkdIRcpntTbl."USAddrss_AddrssLn2Txt", ' ', return_SkdIRcpntTbl."FrgnAddrss_AddrssLn2Txt")) as "Rcpnt_AddrssLn2",
40 | trim(concat(return_SkdIRcpntTbl."USAddrss_CtyNm", ' ', return_SkdIRcpntTbl."FrgnAddrss_CtyNm")) as "Rcpnt_CtyNm",
41 | trim(concat(return_SkdIRcpntTbl."USAddrss_SttAbbrvtnCd", ' ', return_SkdIRcpntTbl."FrgnAddrss_PrvncOrSttNm")) as "Rcpnt_SttAbbrvtnCd",
42 | return_SkdIRcpntTbl."RcpntTbl_CshGrntAmt" as "Rcpnt_Amt",
43 | return_SkdIRcpntTbl."RcpntTbl_PrpsOfGrntTxt" as "Rcpnt_PrpsTxt",
44 | trim(concat(return_SkdIRcpntTbl."USAddrss_ZIPCd", ' ', return_SkdIRcpntTbl."FrgnAddrss_FrgnPstlCd")) as "Rcpnt_ZIPCd",
45 | '' as "Rcpnt_Rltnshp",
46 | return_SkdIRcpntTbl."RcpntTbl_IRCSctnDsc" as "Rcpnt_FndtnStts"
47 | INTO TEMPORARY TABLE grants
48 | FROM return_SkdIRcpntTbl
49 | LEFT JOIN address_table
50 | ON return_SkdIRcpntTbl.object_id = address_table.object_id
51 | AND return_SkdIRcpntTbl.ein = address_table.ein;
52 |
53 |
54 |
55 |
56 | -- Add org type data
57 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990ScheduleI' as form, grants.* into temporary table grants_types from grants left join org_types on grants.object_id = org_types.object_id and grants."Donor_EIN" = org_types.ein;
58 |
59 | -- Then copy to local with \copy:
60 |
61 | \copy grants_types to '/data/file_exports/skedigrants.csv' with csv header;
62 |
63 |
64 |
65 |
66 | --------
67 | -- Form 990PF Part XV "Grant or Contribution Paid During Year"
68 | --
69 | -- See the IRSX documentation for Form 990PF Part XV [Grant or Contribution Paid During Year](http://www.irsx.info/metadata/groups/PFGrntOrCntrbtnPdDrYr.html)
70 | --
71 | -- Note that there's also a different section for grants or contributions approved for future years, which we aren't using in order to avoid double-counting; see [the form instructions](https://www.irs.gov/instructions/i990pf#idm140486306377296) for (not much) more info.
72 | --------
73 |
74 |
75 | DROP TABLE IF EXISTS pfgrants;
76 |
77 | SELECT
78 | return_PFGrntOrCntrbtnPdDrYr.object_id as object_id,
79 | address_table."RtrnHdr_TxPrdEndDt",
80 | address_table."RtrnHdr_TxYr",
81 | address_table."BsnssOffcr_SgntrDt",
82 | address_table."BsnssNm_BsnssNmLn1Txt" as "Donor_BsnssNmLn1",
83 | address_table."BsnssNm_BsnssNmLn2Txt" as "Donor_BsnssNmLn2",
84 | address_table."BsnssOffcr_PrsnNm" as "Donor_BsnssOffcr_PrsnNm",
85 | address_table."BsnssOffcr_PrsnTtlTxt" as "Donor_BsnssOffcr_PrsnTtlTxt",
86 | address_table."BsnssOffcr_PhnNm" as "Donor_BsnssOffcr_PhnNm" ,
87 | address_table."BsnssOffcr_EmlAddrssTxt" as "Donor_BsnssOffcr_EmlAddrssTxt" ,
88 | address_table."USAddrss_AddrssLn1Txt" as "Donor_AddrssLn1Txt",
89 | address_table."USAddrss_AddrssLn2Txt" as "Donor_AddrssLn2Txt",
90 | address_table."USAddrss_CtyNm" as "Donor_CtyNm",
91 | address_table."USAddrss_SttAbbrvtnCd" as "Donor_SttAbbrvtnCd",
92 | address_table."USAddrss_ZIPCd" as "Donor_ZIPCd",
93 | address_table."FrgnAddrss_AddrssLn1Txt" as "Donor_FrgnAddrss_AddrssLn1Txt",
94 | address_table."FrgnAddrss_AddrssLn2Txt" as "Donor_FrgnAddrss_AddrssLn2Txt",
95 | address_table."FrgnAddrss_CtyNm" as "Donor_FrgnAddrss_CtyNm",
96 | address_table."FrgnAddrss_PrvncOrSttNm" as "Donor_PrvncOrSttNm",
97 | address_table."FrgnAddrss_CntryCd" as "Donor_CntryCd",
98 | return_PFGrntOrCntrbtnPdDrYr.ein as "Donor_EIN",
99 | '' as "Rcpnt_EIN",
100 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntPrsnNm" as "RcpntPrsnNm",
101 | return_PFGrntOrCntrbtnPdDrYr."RcpntBsnssNm_BsnssNmLn1Txt" as "Rcpnt_BsnssNmLn1",
102 | return_PFGrntOrCntrbtnPdDrYr."RcpntBsnssNm_BsnssNmLn2Txt" as "Rcpnt_BsnssNmLn2",
103 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_AddrssLn1Txt", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_AddrssLn1Txt")) as "Rcpnt_AddrssLn1",
104 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_AddrssLn2Txt", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_AddrssLn2Txt")) as "Rcpnt_AddrssLn2",
105 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_CtyNm", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_CtyNm")) as "Rcpnt_CtyNm",
106 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_SttAbbrvtnCd", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_PrvncOrSttNm")) as "Rcpnt_SttAbbrvtnCd",
107 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_Amt" as "Rcpnt_Amt",
108 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_GrntOrCntrbtnPrpsTxt" as "Rcpnt_PrpsTxt",
109 | trim(concat(return_PFGrntOrCntrbtnPdDrYr."RcpntUSAddrss_ZIPCd", ' ', return_PFGrntOrCntrbtnPdDrYr."RcpntFrgnAddrss_FrgnPstlCd")) as "Rcpnt_ZIPCd",
110 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntRltnshpTxt" as "Rcpnt_Rltnshp",
111 | return_PFGrntOrCntrbtnPdDrYr."GrntOrCntrbtnPdDrYr_RcpntFndtnSttsTxt" as "Rcpnt_FndtnStts"
112 | INTO TEMPORARY TABLE pfgrants
113 | FROM return_PFGrntOrCntrbtnPdDrYr
114 | LEFT JOIN address_table ON return_PFGrntOrCntrbtnPdDrYr.object_id = address_table.object_id
115 | AND return_PFGrntOrCntrbtnPdDrYr.ein = address_table.ein;
116 |
117 | -- Add org type data
118 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990PF' as form, pfgrants.* into temporary table pfgrants_types from pfgrants left join org_types on pfgrants.object_id = org_types.object_id and pfgrants."Donor_EIN" = org_types.ein;
119 |
120 | -- Copy to local
121 |
122 | \copy pfgrants_types to '/data/file_exports/pfgrants.csv' with csv header;
123 |
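A quick spot-check of the exported file (a Python sketch; the path matches the \copy target above):

    import csv

    with open('/data/file_exports/skedigrants.csv') as f:
        rows = list(csv.DictReader(f))
    print(len(rows))                     # number of grant rows exported
    if rows:
        print(rows[0]['Rcpnt_PrpsTxt'])  # purpose text of the first grant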
--------------------------------------------------------------------------------
/irsdb/return/management/commands/load_filings_multithreaded.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import requests
4 |
5 | from datetime import datetime
6 |
7 | from django.core.management.base import BaseCommand
8 | from django.conf import settings
9 |
10 | from filing.models import Filing
11 | from schemas.model_accumulator import Accumulator
12 | from irsx.settings import INDEX_DIRECTORY
13 | from irsx.file_utils import stream_download
14 | from irsx.xmlrunner import XMLRunner
15 |
16 | from queue import Queue
17 | from threading import Thread
18 | from django.db import connection
19 |
20 | # this is how many we process at a time; there's a separate batch size
21 | # in the model accumulator for how many rows are committed at once
22 | BATCH_SIZE = 1000
23 |
24 | class DownloadWorker(Thread):
25 |     # NOTE: each worker thread builds its own XMLRunner and
26 |     # Accumulator instances in run(), below.
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | def process_sked(self, sked):
35 | """ Enter just one schedule """
36 | #print("Processing schedule %s" % sked['schedule_name'])
37 | for part in sked['schedule_parts'].keys():
38 | partname = part
39 | partdata = sked['schedule_parts'][part]
40 | #print("part %s %s" % (partname, partdata))
41 |
42 | self.accumulator.add_model(partname, partdata)
43 |
44 | for groupname in sked['groups'].keys():
45 | for groupdata in sked['groups'][groupname]:
46 | #print("group %s %s" % (groupname, groupdata) )
47 | self.accumulator.add_model(groupname, groupdata)
48 |
49 | def run_filing(self, filing):
50 | # print (filing)
51 |
52 | object_id = filing.object_id
53 |
54 | parsed_filing = self.xml_runner.run_filing(object_id)
55 | if not parsed_filing:
56 | print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (
57 | filing, metadata_row))
58 | return None
59 |
60 | schedule_list = parsed_filing.list_schedules()
61 | # print("sked list is %s" % schedule_list)
62 |
63 | result = parsed_filing.get_result()
64 |
65 | keyerrors = parsed_filing.get_keyerrors()
66 |
67 | if keyerrors:
68 | # If we find keyerrors--xpaths that are missing from our spec, note it
69 | print("Key error %s")
70 | has_keyerrors = len(keyerrors) > 0
71 | print("keyerror: %s" % keyerrors)
72 | filing.error_details = str(keyerrors)
73 | filing.key_error_count = len(keyerrors)
74 | filing.is_error = has_keyerrors
75 | filing.save()
76 |
77 | if result:
78 | for sked in result:
79 | self.process_sked(sked)
80 | else:
81 | print("Filing not parsed %s " % object_id)
82 |
83 |
84 | def __init__(self, queue):
85 | Thread.__init__(self)
86 | self.queue = queue
87 |
88 | def run(self):
89 | self.xml_runner = XMLRunner()
90 | self.accumulator = Accumulator()
91 | while True:
92 | filing = self.queue.get()
93 | self.run_filing(filing)
94 | self.queue.task_done()
95 | connection.close()
96 |
97 | class Command(BaseCommand):
98 | help = '''
99 | Enter the filings, one by one.
100 | Loading is done in bulk, though status on the filings is updated one at a time.
101 |
102 | '''
103 |
104 | def add_arguments(self, parser):
105 | # Positional arguments
106 | parser.add_argument('year', nargs=1, type=int)
107 |
108 | def setup(self):
109 | # get an XMLRunner -- this is what actually does the parsing
110 | self.xml_runner = XMLRunner()
111 | self.accumulator = Accumulator()
112 |
113 |
114 | def process_sked(self, sked):
115 | """ Enter just one schedule """
116 | #print("Processing schedule %s" % sked['schedule_name'])
117 | for part in sked['schedule_parts'].keys():
118 | partname = part
119 | partdata = sked['schedule_parts'][part]
120 | #print("part %s %s" % (partname, partdata))
121 |
122 | self.accumulator.add_model(partname, partdata)
123 |
124 | for groupname in sked['groups'].keys():
125 | for groupdata in sked['groups'][groupname]:
126 | #print("group %s %s" % (groupname, groupdata) )
127 | self.accumulator.add_model(groupname, groupdata)
128 |
129 |
130 | def run_filing(self, filing):
131 |
132 | object_id = filing.object_id
133 |
134 | parsed_filing = self.xml_runner.run_filing(object_id)
135 | if not parsed_filing:
136 | print("Skipping filing %s(filings with pre-2013 filings are skipped)\n row details: %s" % (filing, metadata_row))
137 | return None
138 |
139 | schedule_list = parsed_filing.list_schedules()
140 | #print("sked list is %s" % schedule_list)
141 |
142 | result = parsed_filing.get_result()
143 |
144 | keyerrors = parsed_filing.get_keyerrors()
145 |
146 | if keyerrors:
147 | # If we find keyerrors--xpaths that are missing from our spec, note it
148 | print("Key error %s")
149 | has_keyerrors = len(keyerrors) > 0
150 | print("keyerror: %s" % keyerrors)
151 | filing.error_details = str(keyerrors)
152 | filing.key_error_count = len(keyerrors)
153 | filing.is_error = has_keyerrors
154 | filing.save()
155 |
156 | if result:
157 | for sked in result:
158 | self.process_sked(sked)
159 | else:
160 | print("Filing not parsed %s " % object_id)
161 |
162 | def handle(self, *args, **options):
163 |
164 | year = int(options['year'][0])
165 | if year not in [2014, 2015, 2016, 2017, 2018]:
166 | raise RuntimeError("Illegal year `%s`. Please enter a year between 2014 and 2018" % year)
167 |
168 | print("Running filings during year %s" % year)
169 | self.setup()
170 |
171 | process_count = 0
172 |
173 | queue = Queue()
174 | # Create 8 worker threads
175 | for x in range(8):
176 | worker = DownloadWorker(queue)
177 | # Setting daemon to True will let the main thread exit even though the workers are blocking
178 | worker.daemon = True
179 | worker.start()
180 |
181 |
182 | while True:
183 |             filings = Filing.objects.filter(submission_year=year).exclude(parse_started=True)[:100]
184 | if not filings:
185 | print("Done")
186 | break
187 |
188 | object_id_list = [f.object_id for f in filings]
189 |
190 | # record that processing has begun
191 | Filing.objects.filter(object_id__in=object_id_list).update(parse_started=True)
192 |
193 | for filing in filings:
194 | # print(filing)
195 | queue.put(filing)
196 | process_count += 1
197 | if process_count % 1000 == 0:
198 | print("Handled %s filings" % process_count)
199 |
200 |         queue.join()  # causes the main thread to wait for the queue to finish processing all the tasks
201 |
202 | # commit anything that's left
203 | self.accumulator.commit_all()
204 | # record that all are complete
205 |         Filing.objects.filter(submission_year=year, parse_started=True).update(process_time=datetime.now(), parse_complete=True)
206 | print("Processed a total of %s filings" % process_count)
207 |
208 |
209 |
210 |
211 |
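Usage is a single positional year (2014-2018, per the check in handle). The command can also be invoked programmatically; a sketch:

    # Equivalent to: python manage.py load_filings_multithreaded 2017
    from django.core.management import call_command

    call_command('load_filings_multithreaded', '2017')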
--------------------------------------------------------------------------------
/irsdb/metadata/management/commands/generate_schemas_from_metadata.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from django.core.management.base import BaseCommand
4 | from django.conf import settings
5 |
6 | from metadata.models import Variable, Group, SchedulePart
7 | from schemas.documentation_utils import most_recent, debracket
8 | from schemas.type_utils import get_django_type, get_sqlalchemy_type
9 |
10 | GENERATED_MODELS_DIR = settings.GENERATED_MODELS_DIR
11 | KNOWN_SCHEDULES = settings.KNOWN_SCHEDULES
12 | CANONICAL_VERSION = '2016v3.0'
13 | soft_tab = ' '
14 |
15 | class Command(BaseCommand):
16 | help = """ Generate django model file.
17 | Hard overwrites the default file.
18 |
19 |     SQLAlchemy in development as a CLI option ( --sqlalchemy )
20 | Pretty rough at this point
21 |
22 | """
23 |
24 |
25 | def add_arguments(self, parser):
26 | parser.add_argument('--sqlalchemy', action='store_true')
27 |
28 | parser.add_argument(
29 | "--schedule",
30 | choices=KNOWN_SCHEDULES,
31 | default=None,
32 | help='Get only that schedule'
33 | )
34 |
35 | def write_model_top(self, sked_name, full_name, parent_sked_name, repeating_group_part=None):
36 | print("Handing part %s %s" % (sked_name, full_name))
37 |
38 | if self.run_django:
39 |
40 | result = "\n#######\n#\n# %s - %s\n" % (parent_sked_name, full_name)
41 | if repeating_group_part:
42 | result += "# A repeating structure from %s\n" % (repeating_group_part)
43 | result += "#\n#######\n"
44 | ## write the start of the first group:
45 | result += "\nclass %s(models.Model):\n" % sked_name
46 | result += soft_tab + "object_id = models.CharField(max_length=31, blank=True, null=True, help_text=\"unique xml return id\")\n"
47 | result += soft_tab + "ein = models.CharField(max_length=15, blank=True, null=True, help_text=\"filer EIN\")\n"
48 | if parent_sked_name=='IRS990ScheduleK':
49 | # It's not clear what the max length is; Return.xsd is unclear
50 | result += soft_tab + "documentId = models.TextField(blank=True, null=True, help_text=\"documentID attribute\")"
51 |
52 |
53 | return result
54 |
55 | elif self.run_sqlalchemy:
56 |
57 | result = "\n#######\n#\n# %s - %s\n" % (parent_sked_name, full_name)
58 | if repeating_group_part:
59 | result += "# A repeating structure from %s\n" % (repeating_group_part)
60 | result += "#\n#######\n"
61 | ## write the start of the first group:
62 | result += "\nclass %s(Base):\n%s__tablename__='%s'\n" % (sked_name,soft_tab, sked_name)
63 | result += soft_tab + "object_id = Column(String(31))\n"
64 | result += soft_tab + "ein = Column(String(15))\n"
65 | if parent_sked_name=='IRS990ScheduleK':
66 | result += soft_tab + "documentId = Column(String(15))\n"
67 |
68 | result += soft_tab + "id = Column(Integer, primary_key=True)\n" # Add a primary key explicitly
69 |
70 | return result
71 |
72 | def write_top_matter(self):
73 | if self.run_django:
74 | self.outfile.write("from django.db import models\n")
75 | elif self.run_sqlalchemy:
76 | self.outfile.write("from sqlalchemy import Column, Integer, String, BigInteger, Text, Numeric\n")
77 | self.outfile.write("from sqlalchemy.ext.declarative import declarative_base\n\n")
78 | self.outfile.write("Base = declarative_base()\n\n")
79 |
80 |
81 | def write_variable(self, variable):
82 | """
83 |         We fall back to a text field, but we expect the types to be filled in where missing
84 | """
85 | print("Write variable name %s type %s" % (variable.db_name, variable.db_type))
86 | if self.run_django:
87 | variable_output = get_django_type(variable.irs_type)
88 | result = "\n" + soft_tab + "%s = models.%s" % (variable.db_name, variable_output)
89 |
90 | elif self.run_sqlalchemy:
91 | variable_output = get_sqlalchemy_type(variable.irs_type)
92 | result = "\n" + soft_tab + "%s = Column(%s)" % (variable.db_name, variable_output)
93 |
94 | # add newline and documentation regardless of where it's going
95 | result += "\n" + soft_tab + "#"
96 | if variable.line_number:
97 | result += " Line number: %s " % most_recent(debracket(variable.line_number))
98 | if variable.description:
99 | result += " Description: %s " % most_recent(debracket(variable.description))
100 | result += " most recent xpath: %s \n" % variable.xpath
101 |
102 | return result
103 |
104 | def write_sked(self, schedule):
105 | print("Handling schedule %s" % (schedule))
106 |
107 |
108 | form_parts = SchedulePart.objects.filter(parent_sked=schedule).order_by('ordering')
109 | for form_part in form_parts:
110 |
111 | model_top = self.write_model_top(
112 | form_part.parent_sked_part,
113 | form_part.part_name,
114 | schedule
115 | )
116 |
117 | variables_in_this_part = Variable.objects.filter(
118 | parent_sked_part=form_part.parent_sked_part,
119 | version_end__in=['','2016', '2017', '2018'],
120 | ).exclude(in_a_group=True).order_by('ordering',)
121 | if variables_in_this_part:
122 | # only write it if it contains anything
123 | self.outfile.write(model_top)
124 | print(model_top)
125 |
126 | for variable in variables_in_this_part:
127 | this_var = self.write_variable(variable)
128 | print(this_var)
129 | self.outfile.write(this_var)
130 |
131 |
132 |
133 | groups_in_this_sked = Group.objects.filter(
134 | parent_sked=schedule,
135 | version_end='',
136 | ).order_by('ordering',)
137 |
138 | for group in groups_in_this_sked:
139 | name = group.db_name
140 | if group.description:
141 | name += " - " + group.description
142 | model_top = self.write_model_top(
143 | group.db_name,
144 | name,
145 | schedule,
146 | repeating_group_part=group.parent_sked_part
147 | )
148 |
149 | variables_in_this_group = Variable.objects.filter(
150 | db_table=group.db_name,
151 | version_end__in=['','2016', '2017', '2018'],
152 | ).order_by('ordering',)
153 |
154 | if variables_in_this_group:
155 | # only write it if it contains anything
156 | self.outfile.write(model_top)
157 | print(model_top)
158 |
159 | for variable in variables_in_this_group:
160 | this_var = self.write_variable(variable)
161 | print(this_var)
162 | self.outfile.write(this_var)
163 |
164 |
165 | def handle(self, *args, **options):
166 | print(options)
167 | self.run_sqlalchemy = options['sqlalchemy']
168 | self.run_django = not self.run_sqlalchemy # Only run one or the other.
169 |
170 | file_output = os.path.join(GENERATED_MODELS_DIR, "django_models_auto.py")
171 | if self.run_sqlalchemy:
172 | file_output = os.path.join(GENERATED_MODELS_DIR, "sqlalchemy_models_auto.py")
173 | self.outfile = open(file_output, 'w')
174 |
175 | self.write_top_matter()
176 |
177 | schedulename = options.get('schedule')
178 | if schedulename:
179 | print("Handling schedule %s" % schedulename)
180 | self.write_sked(schedulename)
181 | else:
182 | for schedulename in KNOWN_SCHEDULES:
183 | print("Handling schedule %s" % schedulename)
184 | self.write_sked(schedulename)
185 |
186 |
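For a sense of what this command emits, here is the shape of one generated Django model fragment (the group, part, description, and xpath below are illustrative; the layout follows write_model_top and write_variable above):

    #######
    #
    # IRS990ScheduleI - SkdIRcpntTbl - Recipient table
    # A repeating structure from skedi_part_ii
    #
    #######

    class SkdIRcpntTbl(models.Model):
        object_id = models.CharField(max_length=31, blank=True, null=True, help_text="unique xml return id")
        ein = models.CharField(max_length=15, blank=True, null=True, help_text="filer EIN")

        RcpntTbl_CshGrntAmt = models.BigIntegerField(null=True, blank=True)
        # Line number: Part II  Description: Cash grant amount  most recent xpath: /IRS990ScheduleI/RecipientTable/CashGrantAmt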
--------------------------------------------------------------------------------
/irsdb/return/sql/delete_all_return.sql:
--------------------------------------------------------------------------------
1 | ------------
2 | --
3 | -- !! Removes every entry from all of the return tables. Use with caution !!
4 | --
5 | -- Also resets the tracking fields in Filing so that each starts "fresh"
6 | -- and load_filings will try to reload these.
7 | -- (If parse_complete is not reset, load_filings will think it's already "done" and skip it)
8 | ------------
9 |
10 | delete from return_cntrctrcmpnstn;
11 | delete from return_ez_part_0;
12 | delete from return_ez_part_i;
13 | delete from return_ez_part_ii;
14 | delete from return_ez_part_iii;
15 | delete from return_ez_part_iv;
16 | delete from return_ez_part_v;
17 | delete from return_ez_part_vi;
18 | delete from return_ezcmpnstnhghstpdempl;
19 | delete from return_ezcmpnstnofhghstpdcntrct;
20 | delete from return_ezfrgnfnnclaccntcntrycd;
21 | delete from return_ezfrgnoffccntrycd;
22 | delete from return_ezoffcrdrctrtrstempl;
23 | delete from return_ezprgrmsrvcaccmplshmnt;
24 | delete from return_ezspclcndtndsc;
25 | delete from return_ezsttswhrcpyofrtrnisfldcd;
26 | delete from return_frgncntrycd;
27 | delete from return_frm990prtviisctna;
28 | delete from return_othrexpnss;
29 | delete from return_othrrvnmsc;
30 | delete from return_part_0;
31 | delete from return_part_i;
32 | delete from return_part_iii;
33 | delete from return_part_iv;
34 | delete from return_part_ix;
35 | delete from return_part_v;
36 | delete from return_part_vi;
37 | delete from return_part_vii;
38 | delete from return_part_viii;
39 | delete from return_part_x;
40 | delete from return_part_xi;
41 | delete from return_part_xii;
42 | delete from return_pf_part_0;
43 | delete from return_pf_part_i;
44 | delete from return_pf_part_ii;
45 | delete from return_pf_part_iii;
46 | delete from return_pf_part_iv;
47 | delete from return_pf_part_ixa;
48 | delete from return_pf_part_ixb;
49 | delete from return_pf_part_v;
50 | delete from return_pf_part_vi;
51 | delete from return_pf_part_viia;
52 | delete from return_pf_part_viib;
53 | delete from return_pf_part_viii;
54 | delete from return_pf_part_x;
55 | delete from return_pf_part_xi;
56 | delete from return_pf_part_xii;
57 | delete from return_pf_part_xiii;
58 | delete from return_pf_part_xiv;
59 | delete from return_pf_part_xv;
60 | delete from return_pf_part_xvia;
61 | delete from return_pf_part_xvib;
62 | delete from return_pf_part_xvii;
63 | delete from return_pfapplctnsbmssninf;
64 | delete from return_pfcmpnstnhghstpdempl;
65 | delete from return_pfcmpnstnofhghstpdcntrct;
66 | delete from return_pfcntrbtngmngrnm;
67 | delete from return_pfcpgnslsstxinvstincm;
68 | delete from return_pffrgncntrycd;
69 | delete from return_pfgrntorcntrapprvfrft;
70 | delete from return_pfgrntorcntrbtnpddryr;
71 | delete from return_pfoffcrdrtrstkyempl;
72 | delete from return_pforgrprtorrgstrsttcd;
73 | delete from return_pfothrrvndscrbd;
74 | delete from return_pfprgrmsrvcrvprtvii;
75 | delete from return_pfrlnofactytaccmofexmptprps;
76 | delete from return_pfrltnshpskddtl;
77 | delete from return_pfshrhldrmngrnm;
78 | delete from return_pfspclcndtndsc;
79 | delete from return_pftrnsfrskddtl;
80 | delete from return_prgrmsrvcrvn;
81 | delete from return_prgsrvcaccmactyothr;
82 | delete from return_returnheader990x_part_i;
83 | delete from return_skdaagrcltrlnmandaddrss;
84 | delete from return_skdafrm990skdaprtvi;
85 | delete from return_skdahsptlnmandaddrss;
86 | delete from return_skdaspprtdorginfrmtn;
87 | delete from return_skdbchrtblcntrbtnsdtl;
88 | delete from return_skdbcntrbtrinfrmtn;
89 | delete from return_skdbnncshprprtycntrbtn;
90 | delete from return_skdcsctn527pltclorg;
91 | delete from return_skdcspplmntlinfrmtndtl;
92 | delete from return_skddinvstprgrmrltdorg;
93 | delete from return_skddothrasstsorg;
94 | delete from return_skddothrlbltsorg;
95 | delete from return_skddothrscrts;
96 | delete from return_skddspplmntlinfrmtndtl;
97 | delete from return_skdespplmntlinfrmtndtl;
98 | delete from return_skdfaccntactvtsotsdus;
99 | delete from return_skdffrgnindvdlsgrnts;
100 | delete from return_skdfgrntstorgotsdus;
101 | delete from return_skdfspplmntlinfrmtndtl;
102 | delete from return_skdgfndrsractvtyinf;
103 | delete from return_skdglcnsdsttscd;
104 | delete from return_skdgspplmntlinfrmtndtl;
105 | delete from return_skdgsttswhrgmngcndctdcd;
106 | delete from return_skdhhsptlfclts;
107 | delete from return_skdhhsptlfcltyplcsprctc;
108 | delete from return_skdhmngmntcandjntvntrs;
109 | delete from return_skdhothhlthcrfclts;
110 | delete from return_skdhspplmntlinfrmtn;
111 | delete from return_skdhspplmntlinfrmtndtl;
112 | delete from return_skdigrntsothrassttindvinus;
113 | delete from return_skdircpnttbl;
114 | delete from return_skdispplmntlinfrmtndtl;
115 | delete from return_skdjrltdorgoffcrtrstkyempl;
116 | delete from return_skdjspplmntlinfrmtndtl;
117 | delete from return_skdkprcdrscrrctvactn;
118 | delete from return_skdkspplmntlinfrmtndtl;
119 | delete from return_skdktxexmptbndsarbtrg;
120 | delete from return_skdktxexmptbndsisss;
121 | delete from return_skdktxexmptbndsprcds;
122 | delete from return_skdktxexmptbndsprvtbsus;
123 | delete from return_skdlbstrinvlvintrstdprsn;
124 | delete from return_skdldsqlfdprsnexbnfttr;
125 | delete from return_skdlgrntasstbnftintrstdprsn;
126 | delete from return_skdllnsbtwnorgintrstdprsn;
127 | delete from return_skdlspplmntlinfrmtndtl;
128 | delete from return_skdmothrnncshcntrtbl;
129 | delete from return_skdmspplmntlinfrmtndtl;
130 | delete from return_skdndspstnofasstsdtl;
131 | delete from return_skdnlqdtnofasstsdtl;
132 | delete from return_skdnspplmntlinfrmtndtl;
133 | delete from return_skdospplmntlinfrmtndtl;
134 | delete from return_skdriddsrgrddentts;
135 | delete from return_skdridrltdorgtxblcrptr;
136 | delete from return_skdridrltdorgtxblprtnrshp;
137 | delete from return_skdridrltdtxexmptorg;
138 | delete from return_skdrspplmntlinfrmtndtl;
139 | delete from return_skdrtrnsctnsrltdorg;
140 | delete from return_skdrunrltdorgtxblprtnrshp;
141 | delete from return_skeda_part_i;
142 | delete from return_skeda_part_ii;
143 | delete from return_skeda_part_iii;
144 | delete from return_skeda_part_iv;
145 | delete from return_skeda_part_v;
146 | delete from return_skeda_part_vi;
147 | delete from return_skedb_part_0;
148 | delete from return_skedb_part_ii;
149 | delete from return_skedc_part_0;
150 | delete from return_skedc_part_iia;
151 | delete from return_skedc_part_iib;
152 | delete from return_skedc_part_iiia;
153 | delete from return_skedc_part_iiib;
154 | delete from return_skedd_part_i;
155 | delete from return_skedd_part_ii;
156 | delete from return_skedd_part_iii;
157 | delete from return_skedd_part_iv;
158 | delete from return_skedd_part_ix;
159 | delete from return_skedd_part_v;
160 | delete from return_skedd_part_vi;
161 | delete from return_skedd_part_vii;
162 | delete from return_skedd_part_viii;
163 | delete from return_skedd_part_x;
164 | delete from return_skedd_part_xi;
165 | delete from return_skedd_part_xii;
166 | delete from return_skede_part_i;
167 | delete from return_skedf_part_i;
168 | delete from return_skedf_part_ii;
169 | delete from return_skedf_part_iv;
170 | delete from return_skedg_part_i;
171 | delete from return_skedg_part_ii;
172 | delete from return_skedg_part_iii;
173 | delete from return_skedh_part_i;
174 | delete from return_skedh_part_ii;
175 | delete from return_skedh_part_iii;
176 | delete from return_skedh_part_va;
177 | delete from return_skedh_part_vd;
178 | delete from return_skedi_part_i;
179 | delete from return_skedi_part_ii;
180 | delete from return_skedj_part_i;
181 | delete from return_skedl_part_i;
182 | delete from return_skedl_part_ii;
183 | delete from return_skedm_part_i;
184 | delete from return_skedn_part_i;
185 | delete from return_skedn_part_ii;
186 | delete from return_skedr_part_v;
187 | delete from return_spclcndtndsc;
188 | delete from return_sttswhrcpyofrtrnisfldcd;
189 |
190 |
191 | -- Now reset tracking error fields.
192 |
193 |
194 | update filing_filing set parse_started=False where parse_started = True;
195 | update filing_filing set parse_complete=False where parse_complete = True;
196 | update filing_filing set process_time=Null where not process_time is Null;
197 |
198 | update filing_filing set is_error=False where is_error = True;
199 | update filing_filing set key_error_count=Null where not key_error_count is Null;
200 | update filing_filing set error_details =Null where not error_details is Null;
201 |
--------------------------------------------------------------------------------
/irsdb/metadata/views.py:
--------------------------------------------------------------------------------
1 | from irsx._version import __version__ as irsx_version
2 | from datetime import datetime
3 | from django.shortcuts import get_object_or_404, render
4 | from django.conf import settings
5 | from django.db import connection
6 | from django.template.loader import render_to_string
7 |
8 | from .models import Variable, LineNumber, Description, SchedulePart, Group
9 |
10 |
11 |
12 | KNOWN_SCHEDULES = settings.KNOWN_SCHEDULES
13 |
14 |
15 | # We're too low rent to install django-bakery;
16 | # in the future we should use CBVs and use it.
17 | # This app is odd in that it tries to only consume
18 | # published metadata .csvs, hence the oddness in the models,
19 | # which reflect the files rather than the data.
20 |
21 | # The base of the file system
22 | try:
23 | FILE_SYSTEM_BASE = settings.FILE_SYSTEM_BASE
24 | except AttributeError:
25 | FILE_SYSTEM_BASE = ''
26 | # When set to True, this will 'cache' a baked version of each page.
27 | # To run a full bake, 'scrape' every page that needs an update,
28 | # then deploy the files.
29 | # Any new static files need to be moved into place by hand; this is just a hack.
30 |
31 | BAKE_OUT = True
32 |
33 | def bake(request, template, context, filepath=None):
34 | path = request.META['PATH_INFO']
35 | if filepath:
36 | path = filepath
37 |     full_path = FILE_SYSTEM_BASE + path  # should use os.path.join here
38 |
39 |
40 | print("Bake with full_path = %s" % full_path)
41 | with open(full_path, "w") as f:
42 | f.write(render_to_string(template, context))
43 |
44 |
45 | def show_xpath(request, xpath):
46 | """
47 | Show a single xpath
48 | """
49 | raw_xpath = xpath
50 | xpath = xpath.replace("-","/")
51 |
52 | print("Xpath is '%s'" % xpath)
53 | this_variable = get_object_or_404(Variable, xpath=xpath)
54 | line_numbers = LineNumber.objects.filter(xpath=xpath)
55 | descriptions = Description.objects.filter(xpath=xpath)
56 | if len(line_numbers)<2:
57 | line_numbers = None
58 | if len(descriptions)<2:
59 | descriptions = None
60 |
61 | context = {
62 | 'this_variable': this_variable,
63 | 'line_numbers':line_numbers,
64 | 'descriptions':descriptions
65 | }
66 | template = 'metadata/xpath.html'
67 |
68 | if BAKE_OUT:
69 | filepath = "/metadata/xpath/" + raw_xpath + ".html"
70 | bake(request, template, context, filepath=filepath)
71 |
72 | return render(request, template, context)
73 |
74 | def show_about(request):
75 | context = {
76 | 'version':irsx_version,
77 | 'update':datetime.now(),
78 | }
79 | template = 'metadata/about.html'
80 |
81 | if BAKE_OUT:
82 | bake(request, template, context)
83 | return render(request, template, context)
84 |
85 | def show_variable(request, db_name, variable_name):
86 | """
87 | Show a single variable
88 | """
89 | print("Variable is '%s'" % variable_name)
90 | variables = Variable.objects.filter(db_table=db_name, db_name=variable_name)
91 | this_variable = variables[0]
92 | xpaths = variables.values_list('xpath', 'version_start', 'version_end')
93 | result_xpaths = []
94 | for xpath in xpaths:
95 | result_xpaths.append({
96 | 'xpath':xpath[0],
97 | 'url':"/metadata/xpath/" + xpath[0].replace("/","-") + ".html",
98 | 'version_start':xpath[1],
99 | 'version_end':xpath[2],
100 | })
101 |
102 | print("xpaths are %s" % result_xpaths)
103 |
104 |
105 |
106 | context = {
107 | 'this_variable': this_variable,
108 | 'xpaths':result_xpaths
109 | }
110 | template = 'metadata/variable.html'
111 |
112 | if BAKE_OUT:
113 | filepath = this_variable.get_absolute_url()
114 |         bake(request, template, context, filepath=filepath)
115 | return render(request, template, context)
116 |
117 | def show_part(request, part):
118 | this_part = get_object_or_404(SchedulePart, parent_sked_part=part)
119 | related_groups = Group.objects.filter(parent_sked_part=part)
120 | groups = []
121 | group_names = []
122 | for group in related_groups:
123 | if group.db_name not in group_names:
124 | groups.append({
125 | 'db_name':group.db_name,
126 | 'get_absolute_url':group.get_absolute_url()
127 | })
128 | group_names.append(group.db_name)
129 |
130 | variables = Variable.objects.filter(parent_sked_part=part, in_a_group=False).exclude(version_end__in=['2013', '2014', '2015']).order_by('line_number', 'ordering')
131 | context = {
132 | 'this_part': this_part,
133 | 'variables':variables,
134 | 'related_groups':groups,
135 | }
136 | template = 'metadata/part.html'
137 |
138 | if BAKE_OUT:
139 | bake(request, template, context)
140 | return render(request, template, context)
141 |
142 | def show_group(request, group):
143 | this_group = Group.objects.filter(db_name=group)[0]
144 | variables = Variable.objects.filter(db_table=group).exclude(version_end__in=['2013', '2014', '2015']).order_by('line_number', 'ordering')
145 |
146 | template = 'metadata/group.html'
147 | context = {
148 | 'this_group': this_group,
149 | 'variables':variables,
150 | }
151 |
152 | if BAKE_OUT:
153 | bake(request, template, context)
154 | return render(request, template,context )
155 |
156 | def join_groups_to_parts():
157 | with connection.cursor() as cursor:
158 | # Sigh.
159 | RAW_SQL = """
160 | SELECT DISTINCT
161 | metadata_group.parent_sked,
162 | metadata_group.parent_sked_part,
163 | metadata_group.db_name,
164 | metadata_schedulepart.ordering
165 | FROM
166 | metadata_group
167 | LEFT JOIN
168 | metadata_schedulepart
169 | ON metadata_group.parent_sked_part = metadata_schedulepart.parent_sked_part
170 | AND metadata_group.parent_sked = metadata_schedulepart.parent_sked
171 | ORDER BY
172 | metadata_group.parent_sked,
173 | metadata_schedulepart.ordering;
174 | """
175 | cursor.execute(RAW_SQL)
176 | rows = cursor.fetchall()
177 | result_obj = []
178 | for row in rows:
179 | result_obj.append({
180 | 'parent_sked':row[0],
181 | 'parent_sked_part':row[1],
182 | 'group_name':row[2],
183 | })
184 | return result_obj
185 |
186 |
187 | def show_forms(request):
188 | """
189 | Show all form parts - this is gnarly and should be baked / cached
190 | """
191 | parts = SchedulePart.objects.all().order_by('parent_sked','ordering')
192 | form_hash = {}
193 | part_hash = {}
194 |
195 | # Sorta laboriously rebuild data structure from metadata.csv files. They weren't designed for this!
196 | for schedule in KNOWN_SCHEDULES:
197 | form_hash[schedule] = {'schedule_name':schedule, 'parts':[]}
198 | for part in parts:
199 | try:
200 | form_hash[part.parent_sked]['parts'].append(part)
201 | except KeyError:
202 | form_hash[part.parent_sked] = {'schedule_name':part.parent_sked, 'parts':[part]}
203 |
204 | sked_part_hash = {}
205 | joined_groups = join_groups_to_parts()
206 | for jg in joined_groups:
207 | try:
208 | sked_part_hash[jg['parent_sked_part']]['groups'].append(jg['group_name'])
209 | except KeyError:
210 | sked_part_hash[jg['parent_sked_part']] = {'groups':[jg['group_name']]}
211 |
212 | return_array = []
213 | for fkey in form_hash.keys():
214 | this_data_obj = {'sked_name':fkey, 'parts':[]}
215 | for i, part in enumerate(form_hash[fkey]['parts']):
216 | part_obj = {}
217 | part_obj['part'] = part
218 | part_obj['groups'] = []
219 | part_obj['name'] = part.parent_sked_part
220 | try:
221 | groups = sked_part_hash[part.parent_sked_part]['groups']
222 | part_obj['groups'] = groups
223 | except KeyError:
224 | part_obj['groups'] = ''
225 | this_data_obj['parts'].append(part_obj)
226 | return_array.append(this_data_obj)
227 |
228 | print(return_array)
229 | template = 'metadata/forms.html'
230 | context = {
231 | 'forms':return_array
232 | }
233 | if BAKE_OUT:
234 | bake(request, template, context)
235 | return render(request, template, context)
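Since bake() writes each rendered page to FILE_SYSTEM_BASE plus the request path, a full 'bake' amounts to crawling the live pages. A minimal sketch of that idea (the host and paths are illustrative; the real URL patterns live in urls.py):

    import requests

    # Requesting each page once regenerates its baked copy on disk.
    for path in ['/metadata/forms.html', '/metadata/about.html']:
        requests.get('http://localhost:8000' + path)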
--------------------------------------------------------------------------------
/contractors.sh:
--------------------------------------------------------------------------------
1 | -- Contractor compensation
2 |
3 |
4 | -- Form 990:
5 |
6 |
7 | DROP TABLE IF EXISTS contractor_comp_990;
8 |
9 | SELECT
10 | address_table.ein,
11 | address_table.object_id,
12 | address_table."RtrnHdr_TxPrdEndDt",
13 | address_table."RtrnHdr_TxYr",
14 | address_table."BsnssOffcr_SgntrDt",
15 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
16 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
17 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
18 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt",
19 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" ,
20 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" ,
21 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
22 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
23 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
24 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
25 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
26 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
27 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
28 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
29 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
30 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
31 | return_CntrctrCmpnstn."CntrctrNm_PrsnNm" as "CntrctrNm_PrsnNm",
32 | trim(concat(return_CntrctrCmpnstn."BsnssNm_BsnssNmLn1Txt", ' ', return_CntrctrCmpnstn."BsnssNm_BsnssNmLn2Txt")) as "Cntrctr_Business",
33 | trim(concat(return_CntrctrCmpnstn."USAddrss_AddrssLn1Txt", ' ', return_CntrctrCmpnstn."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1",
34 | trim(concat(return_CntrctrCmpnstn."USAddrss_AddrssLn2Txt", ' ', return_CntrctrCmpnstn."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2",
35 | trim(concat(return_CntrctrCmpnstn."USAddrss_CtyNm", ' ', return_CntrctrCmpnstn."FrgnAddrss_CtyNm")) as "Cntrctr_City",
36 | trim(concat(return_CntrctrCmpnstn."USAddrss_ZIPCd", ' ', return_CntrctrCmpnstn."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP",
37 | trim(concat(return_CntrctrCmpnstn."USAddrss_SttAbbrvtnCd" , ' ', return_CntrctrCmpnstn."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State",
38 | return_CntrctrCmpnstn."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd",
39 | return_CntrctrCmpnstn."CntrctrCmpnstn_SrvcsDsc" as "SrvcsDsc",
40 | return_CntrctrCmpnstn."CntrctrCmpnstn_CmpnstnAmt" as "CmpnstnAmt"
41 | INTO TEMPORARY TABLE contractor_comp_990
42 | FROM return_CntrctrCmpnstn
43 | LEFT JOIN address_table ON return_CntrctrCmpnstn.object_id = address_table.object_id
44 | AND return_CntrctrCmpnstn.ein = address_table.ein;
45 |
46 |
47 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990' as form, contractor_comp_990.* into temporary table contractor_comp_990_types from contractor_comp_990 left join org_types on contractor_comp_990.object_id = org_types.object_id and contractor_comp_990.ein = org_types.ein;
48 |
49 |
50 | \copy contractor_comp_990_types to '/data/file_exports/contractors_990.csv' with csv header;
51 |
52 |
53 |
54 | -- 990 PF
55 |
56 | DROP TABLE IF EXISTS contractor_comp_990_pf;
57 |
58 | SELECT
59 | address_table.ein,
60 | address_table.object_id,
61 | address_table."RtrnHdr_TxPrdEndDt",
62 | address_table."RtrnHdr_TxYr",
63 | address_table."BsnssOffcr_SgntrDt",
64 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
65 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
66 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
67 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt",
68 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" ,
69 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" ,
70 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
71 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
72 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
73 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
74 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
75 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
76 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
77 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
78 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
79 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
80 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_PrsnNm" as "CntrctrNm_PrsnNm",
81 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn1", ' ', return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn2")) as "Cntrctr_Business",
82 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn1Txt", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1",
83 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn2Txt", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2",
84 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_CtyNm", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_CtyNm")) as "Cntrctr_City",
85 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_ZIPCd", ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP",
86 | trim(concat(return_PFCmpnstnOfHghstPdCntrct."USAddrss_SttAbbrvtnCd" , ' ', return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State",
87 | return_PFCmpnstnOfHghstPdCntrct."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd",
88 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_SrvcTxt" as "SrvcsDsc",
89 | return_PFCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_CmpnstnAmt" as "CmpnstnAmt"
90 | INTO TEMPORARY TABLE contractor_comp_990_pf
91 | FROM return_PFCmpnstnOfHghstPdCntrct
92 | LEFT JOIN address_table ON return_PFCmpnstnOfHghstPdCntrct.object_id = address_table.object_id
93 | AND return_PFCmpnstnOfHghstPdCntrct.ein = address_table.ein;
94 |
95 |
96 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990PF' as form, contractor_comp_990_pf.* into temporary table contractor_comp_990_pf_types from contractor_comp_990_pf left join org_types on contractor_comp_990_pf.object_id = org_types.object_id and contractor_comp_990_pf.ein = org_types.ein;
97 |
98 |
99 | \copy contractor_comp_990_pf_types to '/data/file_exports/contractor_comp_990_pf.csv' with csv header;
100 |
101 |
102 | -- 990EZ
103 |
104 | DROP TABLE IF EXISTS contractor_comp_990_ez;
105 |
106 | SELECT
107 | address_table.ein,
108 | address_table.object_id,
109 | address_table."RtrnHdr_TxPrdEndDt",
110 | address_table."RtrnHdr_TxYr",
111 | address_table."BsnssOffcr_SgntrDt",
112 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
113 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
114 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
115 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_ BsnssOffcr_PrsnTtlTxt",
116 | address_table."BsnssOffcr_PhnNm" as "Org_ BsnssOffcr_PhnNm" ,
117 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_ BsnssOffcr_EmlAddrssTxt" ,
118 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
119 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
120 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
121 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
122 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
123 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
124 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
125 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
126 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
127 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
128 | return_EZCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_PrsnNm" as "CntrctrNm_PrsnNm",
129 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn1", ' ', return_EZCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_BsnssNmLn2")) as "Cntrctr_Business",
130 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn1Txt", ' ', return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn1Txt")) as "Cntrctr_Address1",
131 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."USAddrss_AddrssLn2Txt", ' ', return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_AddrssLn2Txt")) as "Cntrctr_Address2",
132 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."USAddrss_CtyNm", ' ', return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_CtyNm")) as "Cntrctr_City",
133 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."USAddrss_ZIPCd", ' ', return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_FrgnPstlCd")) as "Cntrctr_ZIP",
134 | trim(concat(return_EZCmpnstnOfHghstPdCntrct."USAddrss_SttAbbrvtnCd", ' ', return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_PrvncOrSttNm")) as "Cntrctr_State",
135 | return_EZCmpnstnOfHghstPdCntrct."FrgnAddrss_CntryCd" as "Cntrctr_FrgnAddrss_CntryCd",
136 | return_EZCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_SrvcTxt" as "SrvcsDsc",
137 | return_EZCmpnstnOfHghstPdCntrct."CmpnstnOfHghstPdCntrct_CmpnstnAmt" as "CmpnstnAmt"
138 | INTO TEMPORARY TABLE contractor_comp_990_ez
139 | FROM return_EZCmpnstnOfHghstPdCntrct
140 | LEFT JOIN address_table ON return_EZCmpnstnOfHghstPdCntrct.object_id = address_table.object_id
141 | AND return_EZCmpnstnOfHghstPdCntrct.ein = address_table.ein;
142 |
143 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", url_base, '/IRS990EZ' as form, contractor_comp_990_ez.* into temporary table contractor_comp_990_ez_types from contractor_comp_990_ez left join org_types on contractor_comp_990_ez.object_id = org_types.object_id and contractor_comp_990_ez.ein = org_types.ein;
144 |
145 | \copy contractor_comp_990_ez_types to '/data/file_exports/contractor_comp_990_ez.csv' with csv header;
146 |
147 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 990-xml-database
2 | Django app to consume and store 990 data and metadata. Depends on [IRSx](https://github.com/jsfenfen/990-xml-reader) (which is installed as a dependency below).
3 |
4 | ## Setup and use
5 |
6 | ### Part 1: clone the repo and configure the app
7 |
8 | 1. git clone this repository `git clone https://github.com/jsfenfen/990-xml-database.git` and `$ cd 990-xml-database`
9 |
10 | 2. install the requirements with `pip install -r requirements.txt`. This is Django 2, so only python3 is supported.
11 |
12 | 3. copy the irsdb/local\_settings.py-example file to irsdb/local\_settings.py and edit it to reflect your database settings.
13 |
14 |
15 | ### Part 2: Add the metadata
16 |
17 |
18 | 1. run `python manage.py makemigrations metadata` to generate the metadata migrations, and then run them with `python manage.py migrate metadata`.
19 |
20 | 2. Load the metadata from the source csv files in generated\_schemas with the management command: `python manage.py load_metadata`. This command erases the metadata before loading, so it can be rerun if it somehow breaks in the middle.
21 |
22 | 3. If the csv files have changed, you can generate migrations for the db: generate the models with `python manage.py generate_schemas_from_metadata` (which puts the new models file in generated\_schemas/ as `django_models_auto.py`), move the generated models file into return/models.py, and run `python manage.py makemigrations return`.
23 |
24 | ### Part 3: index file data
25 |
26 | The IRS releases metadata files which include the unique id, EIN and other information about each .xml filing. We need to put this in the database to make sense of the raw filings.
27 |
28 | 1. run `python manage.py makemigrations filing` to generate the filing migrations, and then run them with `python manage.py migrate filing`.
29 |
30 | 2. Run `$ python manage.py enter_yearly_submissions YYYY` where YYYY is the year corresponding to a yearly index file that has already been downloaded. (If it hasn't been downloaded you can retrieve it with `irsx_index --year=YYYY`.) This script checks to see if the IRS' index file is any bigger than the one on disk, and only runs if it is. You can force it to try to enter any new filings (regardless of whether the file is updated) with the `--enter` option.
31 |
32 | #### Sidebar: 2014 file may need fixing
33 | __There's a problem with the 2014 index file.__ An internal comma has "broken" the .csv format for some time. You can fix it with a perl one liner (which first backs the file up to index_2014.csv.bak before modifying it)
34 |
35 | $ perl -i.bak -p -e 's/SILVERCREST ASSET ,AMAGEMENT/SILVERCREST ASSET MANAGEMENT/g' index_2014.csv
36 |
37 | We can see that it worked by diffing it.
38 |
39 | $ diff index_2014.csv index_2014.csv.bak
40 | 39569c39569
41 | < 11146506,EFILE,136171217,201212,1/14/2014,MOSTYN FOUNDATION INC CO SILVERCREST ASSET MANAGEMENT,990PF,93491211007003,201302119349100700
42 | ---
43 | > 11146506,EFILE,136171217,201212,1/14/2014,MOSTYN FOUNDATION INC CO SILVERCREST ASSET ,AMAGEMENT,990PF,93491211007003,201302119349100700
44 |
45 | For more details see [here](https://github.com/jsfenfen/990-xml-reader/blob/master/2014_is_broken.md).
46 |
47 | ### Part 5: Generate the schema files - Not recommended, this is only used when regenerating models for a new IRSX version
48 |
49 | Run `$ python manage.py generate_schemas_from_metadata` to generate a django models file (to the directory generated_models). You can modify these and put them into return/models.
50 |
51 | ### Part 6. Create the return tables
52 |
53 | Create the tables in the return model by running the migrations.
54 |
55 | Run `$ python manage.py makemigrations return`
56 | to make the migrations, and
57 | `$ python manage.py migrate return`
58 | to run them.
59 |
60 | ### Part 7. Load the filings
61 |
62 | Actually enter the filings into the database with
63 | `$ python manage.py load_filings YYYY`, where YYYY is the year to load.
64 |
65 | This script will take a while to run--probably at least several hours per year. You'll likely want to run it using nohup, with something like this:
66 |
67 |
68 | `$ nohup python manage.py load_filings YYYY &`
69 |
70 | This detaches the process from the terminal, so the command keeps running if your connection times out.
71 |
72 | You may want to adjust your postgres settings for better load performance, but you'll need to pay attention to overall memory and resource use.
73 |
74 | ### Post-loading concerns
75 |
76 |
77 | #### Analyze the load process
78 |
79 | The loading process uses columns in the filing model to track load progress (and to ensure the same files aren't loaded twice).
80 |
81 | TK - explanation of keyerrors
82 |
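A minimal sketch of checking progress from `python manage.py shell`. The model name `Filing` is an assumption; the `submission_year` and `parse_complete` columns are described in the "Monthly load" section below:

    # run inside `python manage.py shell`; the model name Filing is assumed
    from filing.models import Filing

    year = Filing.objects.filter(submission_year=2018)
    print("%s of %s filings loaded" % (
        year.filter(parse_complete=True).count(), year.count()))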
83 |
84 | #### Removing all rows
85 |
86 | There's a [sql script](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/sql/delete_all_return.sql) that will remove all entered rows from all return tables and reset the fields in filing as if they were new.
87 |
88 | If you want to live dangerously, you can run it from the console like this:
89 |
90 | `$ python manage.py dbshell < ./return/sql/delete_all_return.sql`
91 |
92 |
93 | #### Adding or removing indexes
94 |
95 | There are management commands to create or drop indexes on object\_id, ein and (for schedule K) documentId. Use
96 | `$ python manage.py make_indexes` or
97 | `$ python manage.py drop_indexes`. These are just conveniences that create (and drop) indexes following a fixed naming scheme--they won't remove other indexes.
98 |
99 | #### Removing a subset of all rows
100 |
101 | You can remove all filings from a given index file with the [remove_year](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/management/commands/remove_year.py) management command. It's likely to run faster if indexes are in place.
102 |
103 | #### Removing only the rows that were half loaded
104 |
105 | If loading gets interrupted, you can remove only the rows where parse\_started is true and parse\_complete is not with the management command [remove\_half\_loaded](https://github.com/jsfenfen/990-xml-database/blob/master/irsdb/return/management/commands/remove_half_loaded.py). It also requires a year as a command line argument.
106 |
107 | `$ python manage.py remove_half_loaded 2018`
108 |
109 | #### File size concerns
110 |
111 | The full download of uncompressed .xml files is roughly 74 gigabytes. Processing a complete year of data probably entails moving at least 15 gigs of xml.
112 |
113 | You probably want to look into a tool to help you move these files in bulk. AWS' S3 CLI can dramatically reduce download time, but seems unhelpful when trying to pull a subset of files (it seems like [--exclude '*'](https://docs.aws.amazon.com/cli/latest/reference/s3/index.html#use-of-exclude-and-include-filters) hangs when processing so many files). You may want to look into moving all the files to your own S3 bucket as well. There are also alternatives to AWS' CLI tool, like [S3 CMD](http://s3tools.org/s3cmd).
114 |
115 | You'll also want to [configure the IRSx file cache directory](https://github.com/jsfenfen/990-xml-reader/#configuring-the-file-cache-directory) by setting the WORKING_DIRECTORY variable to the path of the folder where the xml files are located.
116 |
117 | The worst option is to download the uncompressed files one at a time. That would be really, really slow.
118 |
119 |
120 | #### Server considerations
121 |
122 | With most hosting providers, you'll need to configure additional storage to support the static files and the database that's ultimately loaded. Make sure that you set the database storage directory to *that storage*, and get the fastest storage type you can afford.
123 |
124 | You may want to look into tuning your database parameters to better support data loading. And you'll get better performance if you only create indexes after loading is complete (and delete them before bulk loads take place).
125 |
126 | One random datapoint: on an Amazon t2.medium ec2 server (~$38/month) with 150 gigs of additional storage and postgres running on the default configs and writing to an SSD EBS volume, loading the complete set of about 490,000 filings from 2017 took about 3 hours.
127 |
128 | #### Monthly load
129 |
130 | This assumes no schema changes are required, which is usually the case.
131 |
132 | Run an S3 sync to the location of the filings. The whole collection is now over 80 GB; make sure you have room. You can also retrieve the files some other way (if you don't retrieve them en masse, the load_filings.py script will attempt to download one filing at a time). It's useful to run this with nohup, e.g.
133 |
134 | nohup aws s3 sync s3://irs-form-990/ ./ &
135 |
136 | Then update the index file data:
137 |
138 | $ python manage.py enter_yearly_submissions 2018
139 |
140 |
141 | index_2018.csv has changed. Downloading updated file...
142 | Done!
143 | Entering xml submissions from /home/webuser/virt/env/lib/python3.5/site-packages/irsx/CSV/index_2018.csv
144 |
145 | Committing 10000 total entered=10000
146 | commit complete
147 | Committing 10000 total entered=20000
148 | commit complete
149 | Added 24043 new entries.
150 |
151 | Then enter the filings into the relational database with:
152 |
153 | $ python manage.py load_filings 2018
154 |
155 | Running filings during year 2018
156 | Processed a total of 100 filings
157 | Processed a total of 200 filings
158 | Processed a total of 300 filings
159 |
160 | ...
161 |
162 | Handled 24000 filings
163 | Processed a total of 24000 filings
164 | Processed a total of 24043 filings
165 | Done
166 |
167 | This script finds filings where `submission_year` is the entered year and `parse_complete` has not been set to True. It enters them in groups of 100 and sets `parse_complete` to True after each batch has completed. The script is fairly fault-tolerant, but if it dies in the middle it's important to remove all the half-entered filings where `parse_started` = True and `parse_complete` is not True. (By default it is null, so don't try to match on `parse_complete` = False; see the sketch below.)
168 |
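A minimal illustration of that null-vs-False caveat in the Django ORM; the model name `Filing` is an assumption, but the column names come from the paragraph above:

    # WRONG: parse_complete=False misses rows where the column is null
    # Filing.objects.filter(submission_year=2018, parse_started=True,
    #                       parse_complete=False)

    # RIGHT: match everything not explicitly marked complete
    half_loaded = Filing.objects.filter(
        submission_year=2018, parse_started=True
    ).exclude(parse_complete=True)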
169 |
170 | --
--------------------------------------------------------------------------------
/sked_l.sh:
--------------------------------------------------------------------------------
1 |
2 | -- Schedule L - Transactions with interested parties
3 |
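-- Each export below follows the same pattern: join the repeating-group
-- table to address_table (filer name, address and officer header fields)
-- on object_id and ein, stash the result in a temp table, join that to
-- org_types for the 501(c)(3)/501(c)/4947(a)(1)/527 indicator columns,
-- and \copy the result out to a csv in /data/file_exports/.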
4 | -- Part I: Excess Benefit Transactions
5 | -- See the repeating group docs [here](http://www.irsx.info/metadata/groups/SkdLDsqlfdPrsnExBnftTr.html)
6 |
7 |
8 |
9 | DROP TABLE IF EXISTS excess_benefits;
10 |
11 | SELECT
12 | address_table."RtrnHdr_TxPrdEndDt",
13 | address_table."RtrnHdr_TxYr",
14 | address_table."BsnssOffcr_SgntrDt",
15 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
16 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
17 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
18 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
19 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
20 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
21 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
22 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
23 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
24 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
25 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
26 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
27 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
28 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
29 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
30 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
31 | return_SkdLDsqlfdPrsnExBnftTr.*
32 | INTO TEMPORARY TABLE excess_benefits
33 | FROM return_SkdLDsqlfdPrsnExBnftTr
34 | LEFT JOIN address_table ON return_SkdLDsqlfdPrsnExBnftTr.object_id = address_table.object_id
35 | AND return_SkdLDsqlfdPrsnExBnftTr.ein = address_table.ein;
36 |
37 |
38 | DROP TABLE IF EXISTS excess_benefits_types;
39 |
40 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, excess_benefits.* into TEMPORARY TABLE excess_benefits_types from excess_benefits left join org_types on excess_benefits.object_id = org_types.object_id;
41 |
42 | \copy excess_benefits_types to '/data/file_exports/excess_benefits.csv' with csv header;
43 |
44 |
45 |
46 |
47 | -- Part II: Loans Between the Organization and Interested Persons
48 |
49 | -- Loans from the org to an insider
50 | -- See the repeating group docs [here](http://www.irsx.info/metadata/groups/SkdLLnsBtwnOrgIntrstdPrsn.html)
51 |
52 |
53 |
54 | DROP TABLE IF EXISTS loans_from;
55 |
56 | SELECT
57 | address_table."RtrnHdr_TxPrdEndDt",
58 | address_table."RtrnHdr_TxYr",
59 | address_table."BsnssOffcr_SgntrDt",
60 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
61 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
62 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
63 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
64 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
65 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
66 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
67 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
68 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
69 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
70 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
71 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
72 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
73 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
74 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
75 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
76 | return_SkdLLnsBtwnOrgIntrstdPrsn.*
77 |
78 | INTO TEMPORARY TABLE loans_from
79 | FROM return_SkdLLnsBtwnOrgIntrstdPrsn
80 | LEFT JOIN address_table ON return_SkdLLnsBtwnOrgIntrstdPrsn.object_id = address_table.object_id
81 | AND return_SkdLLnsBtwnOrgIntrstdPrsn.ein = address_table.ein
82 | WHERE return_SkdLLnsBtwnOrgIntrstdPrsn."LnFrmOrgnztnInd" = 'X';
83 |
84 |
85 | drop table if exists loans_from_types;
86 |
87 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, loans_from.* into temporary table loans_from_types from loans_from left join org_types on loans_from.object_id = org_types.object_id and loans_from.ein = org_types.ein;
88 |
89 |
90 | \copy loans_from_types to '/data/file_exports/loans_from.csv' with csv header;
91 |
92 |
93 |
94 | -- Loans from an insider to the org
95 |
96 |
97 |
98 | DROP TABLE IF EXISTS loans_to;
99 |
100 | SELECT
101 | address_table."RtrnHdr_TxPrdEndDt",
102 | address_table."RtrnHdr_TxYr",
103 | address_table."BsnssOffcr_SgntrDt",
104 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
105 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
106 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
107 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
108 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
109 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
110 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
111 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
112 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
113 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
114 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
115 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
116 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
117 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
118 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
119 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
120 | return_SkdLLnsBtwnOrgIntrstdPrsn.*
121 | INTO TEMPORARY TABLE loans_to
122 | FROM return_SkdLLnsBtwnOrgIntrstdPrsn
123 | LEFT JOIN address_table ON return_SkdLLnsBtwnOrgIntrstdPrsn.object_id = address_table.object_id
124 | AND return_SkdLLnsBtwnOrgIntrstdPrsn.ein = address_table.ein
125 | WHERE return_SkdLLnsBtwnOrgIntrstdPrsn."LnTOrgnztnInd" = 'X';
126 |
127 | drop table if exists loans_to_types;
128 |
129 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, loans_to.* into TEMPORARY TABLE loans_to_types from loans_to left join org_types on loans_to.object_id = org_types.object_id and loans_to.ein = org_types.ein;
130 |
131 |
132 | \copy loans_to_types to '/data/file_exports/loans_to.csv' with csv header;
133 |
134 |
135 | -- Part III: Grants or Assistance Benefiting Interested Persons
136 |
137 | -- http://www.irsx.info/metadata/groups/SkdLGrntAsstBnftIntrstdPrsn.html
138 |
139 | DROP TABLE IF EXISTS insider_assistance;
140 |
141 | SELECT
142 | address_table."RtrnHdr_TxPrdEndDt",
143 | address_table."RtrnHdr_TxYr",
144 | address_table."BsnssOffcr_SgntrDt",
145 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
146 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
147 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
148 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
149 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
150 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
151 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
152 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
153 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
154 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
155 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
156 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
157 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
158 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
159 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
160 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
161 | return_SkdLGrntAsstBnftIntrstdPrsn.*
162 | INTO TEMPORARY TABLE insider_assistance
163 | FROM return_SkdLGrntAsstBnftIntrstdPrsn
164 | LEFT JOIN address_table ON return_SkdLGrntAsstBnftIntrstdPrsn.object_id = address_table.object_id
165 | AND return_SkdLGrntAsstBnftIntrstdPrsn.ein = address_table.ein;
166 |
167 |
168 | drop table if exists insider_assistance_types;
169 |
170 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, insider_assistance.* into temporary table insider_assistance_types from insider_assistance left join org_types on insider_assistance.object_id = org_types.object_id and insider_assistance.ein = org_types.ein;
171 |
172 |
173 | \copy insider_assistance_types to '/data/file_exports/insider_assistance.csv' with csv header;
174 |
175 |
176 |
177 | -- Part IV: Business Transactions Involving Interested Persons
178 |
179 |
180 | -- http://www.irsx.info/metadata/groups/SkdLBsTrInvlvIntrstdPrsn.html
181 |
182 | DROP TABLE IF EXISTS insider_transactions;
183 |
184 | SELECT
185 | address_table."RtrnHdr_TxPrdEndDt",
186 | address_table."RtrnHdr_TxYr",
187 | address_table."BsnssOffcr_SgntrDt",
188 | address_table."BsnssNm_BsnssNmLn1Txt" as "Org_BsnssNmLn1",
189 | address_table."BsnssNm_BsnssNmLn2Txt" as "Org_BsnssNmL21",
190 | address_table."BsnssOffcr_PrsnNm" as "Org_BsnssOffcr_PrsnNm",
191 | address_table."BsnssOffcr_PrsnTtlTxt" as "Org_BsnssOffcr_PrsnTtlTxt",
192 | address_table."BsnssOffcr_PhnNm" as "Org_BsnssOffcr_PhnNm" ,
193 | address_table."BsnssOffcr_EmlAddrssTxt" as "Org_BsnssOffcr_EmlAddrssTxt" ,
194 | address_table."USAddrss_AddrssLn1Txt" as "Org_AddrssLn1Txt",
195 | address_table."USAddrss_AddrssLn2Txt" as "Org_AddrssLn2Txt",
196 | address_table."USAddrss_CtyNm" as "Org_CtyNm",
197 | address_table."USAddrss_SttAbbrvtnCd" as "Org_SttAbbrvtnCd",
198 | address_table."USAddrss_ZIPCd" as "Org_ZIPCd",
199 | address_table."FrgnAddrss_AddrssLn1Txt" as "Org_FrgnAddrss_AddrssLn1Txt",
200 | address_table."FrgnAddrss_AddrssLn2Txt" as "Org_FrgnAddrss_AddrssLn2Txt",
201 | address_table."FrgnAddrss_CtyNm" as "Org_FrgnAddrss_CtyNm",
202 | address_table."FrgnAddrss_PrvncOrSttNm" as "Org_PrvncOrSttNm",
203 | address_table."FrgnAddrss_CntryCd" as "Org_CntryCd",
204 | return_SkdLBsTrInvlvIntrstdPrsn.*
205 | INTO TEMPORARY TABLE insider_transactions
206 | FROM return_SkdLBsTrInvlvIntrstdPrsn
207 | LEFT JOIN address_table ON return_SkdLBsTrInvlvIntrstdPrsn.object_id = address_table.object_id
208 | AND return_SkdLBsTrInvlvIntrstdPrsn.ein = address_table.ein;
209 |
210 | drop table if exists insider_transactions_types;
211 |
212 | select "Orgnztn501c3Ind", "Orgnztn501cInd", "Orgnztn49471NtPFInd", "Orgnztn527Ind", concat(org_types.ein, '/', org_types.object_id) as url_base, '/IRS990ScheduleL' as form, insider_transactions.* into temporary table insider_transactions_types from insider_transactions left join org_types on insider_transactions.object_id = org_types.object_id and insider_transactions.ein = org_types.ein;
213 |
214 |
215 | \copy insider_transactions_types to '/data/file_exports/insider_transactions.csv' with csv header;
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
/irsdb/dump_from_manifest.py:
--------------------------------------------------------------------------------
1 |
2 | import unicodecsv as csv
3 | from irsx.xmlrunner import XMLRunner
4 |
5 | from irsx.filing import FileMissingException
6 | from stream_extractor import StreamExtractor
7 |
8 |
9 |
10 | output_streams = {
11 |
12 | '990_part_0': {
13 | 'filename':'990_part_0',
14 | 'headers': ["ein", "object_id", 'Orgnztn527Ind', 'Orgnztn501cInd', 'Orgnztn49471NtPFInd', 'Orgnztn501c3Ind', 'WbstAddrssTxt', 'OfOrgnztnTrstInd', 'OthrOrgnztnDsc', 'OfOrgnztnCrpInd', 'OfOrgnztnOthrInd', 'OfOrgnztnAsscInd', 'FrmtnYr', 'LglDmclSttCd', 'LglDmclCntryCd']
15 | },
16 | '990_part_i': {
17 | 'filename':'990_part_i',
18 | 'headers': ["ein", "object_id", "CntrctTrmntnInd", "TtlEmplyCnt", "TtlVlntrsCnt", "CYInvstmntIncmAmt", "CYTtlRvnAmt", "CYTtlExpnssAmt", "CYRvnsLssExpnssAmt", "TtlAsstsEOYAmt", "ActvtyOrMssnDsc" ]
19 | },
20 | '990_part_iv': {
21 | 'filename':'990_part_iv',
22 | 'headers': ["ein", "object_id", "PrtlLqdtnInd"]
23 | },
24 | '990ez_part_0': {
25 | 'filename':'990ez_part_0',
26 | 'headers': ["ein", "object_id", "WbstAddrssTxt", "Orgnztn527Ind", "Orgnztn501c3Ind", "Orgnztn49471NtPFInd", "Orgnztn501cInd", "OfOrgnztnOthrDsc", "OfOrgnztnOthrInd", "OfOrgnztnCrpInd", "OfOrgnztnTrstInd", "OfOrgnztnAsscInd", "GrssRcptsAmt"]
27 | },
28 | '990ez_part_i': {
29 | 'filename':'990ez_part_i',
30 | 'headers': ["ein", "object_id", "TtlExpnssAmt", "TtlRvnAmt"]
31 | },
32 | '990pf_part_0': {
33 | 'filename':'990pf_part_0',
34 | 'headers': ["ein", "object_id","PFSttsTrmSct507b1AInd", "Orgnztn501c3TxblPFInd", "Orgnztn501c3ExmptPFInd", "Orgnztn49471TrtdPFInd", "FMVAsstsEOYAmt"]
35 | },
36 | '990pf_part_i': {
37 | 'filename':'990pf_part_i',
38 | 'headers': ["ein", "object_id", 'TtlRvAndExpnssAmt', 'CmpOfcrDrTrstRvAndExpnssAmt', 'OthEmplSlrsWgsRvAndExpnssAmt', 'TtOprExpnssRvAndExpnssAmt', 'CntrPdRvAndExpnssAmt', 'TtlExpnssRvAndExpnssAmt']
39 | },
40 | '990pf_part_viia': {
41 | 'filename':'990pf_part_viia',
42 | 'headers': ["ein", "object_id", "SttmntsRgrdngActy_WbstAddrssTxt"]
43 | },
44 | 'employees_990': {
45 | 'filename':'employees_990', # output file for this stream
46 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"]
47 | },
48 | 'employees_990PF': {
49 | 'filename':'employees_990PF', # output file for this stream
50 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"]
51 | },
52 | 'employees_990EZ': {
53 | 'filename':'employees_990EZ', # output file for this stream
54 | 'headers':["ein", "object_id", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp", "other_cmp", "form", "source", "IndvdlTrstOrDrctrInd","InstttnlTrstInd","OffcrInd","KyEmplyInd","HghstCmpnstdEmplyInd","FrmrOfcrDrctrTrstInd"]
55 | },
56 | 'header_metadata': {
57 | 'filename':'header_metadata', # output file for this stream
58 | 'headers':["ein", "object_id", "BsnssNm_BsnssNmLn1Txt", "BsnssNm_BsnssNmLn2Txt", "USAddrss_AddrssLn1Txt", "USAddrss_AddrssLn2Txt", "USAddrss_CtyNm", "USAddrss_SttAbbrvtnCd", "RtrnHdr_TxPrdBgnDt", "RtrnHdr_TxPrdEndDt", "BsnssOffcr_SgntrDt", "Flr_PhnNm", "RtrnHdr_RtrnTs"]
59 | }
60 | ,
61 | '990L_loans': {
62 | 'filename':'990L_loans', # output file for this stream
63 | 'headers':[ 'ein', 'object_id', 'BsnssNmLn1Txt', 'BsnssNmLn2Txt', 'PrsnNm', 'RltnshpWthOrgTxt', 'LnPrpsTxt', 'LnFrmOrgnztnInd', 'LnTOrgnztnInd', 'OrgnlPrncplAmt', 'BlncDAmt', 'DfltInd', 'BrdOrCmmttApprvlInd', 'WrttnAgrmntInd']
64 | }
65 | ,
66 | '990L_grants': {
67 | 'filename':'990L_grants', # output file for this stream
68 | 'headers':["ein", "object_id", "PrsnNm", "BsnssNmLn1Txt", "BsnssNmLn2Txt", "RltnshpWthOrgTxt", "CshGrntAmt", "OfAssstncTxt", "AssstncPrpsTxt"]
69 | },
70 | '990L_trans': {
71 | 'filename':'990L_trans', # output file for this stream
72 | 'headers':["ein", "object_id", "BsnssNmLn1Txt", "PrsnNm", "BsnssNmLn2Txt", "RltnshpDscrptnTxt", "TrnsctnAmt", "TrnsctnDsc", "ShrngOfRvnsInd"]
73 | }
74 | # 'diversions': {
75 | # 'filename':'diversions', # will output to diversionsYYYY.csv where year is specified below
76 | # 'headers':["year", "ein", "object_id", "taxpayer_name", "diversion_ind"]
77 | # }
78 | }
79 |
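# data_capture_dict maps IRSX form names to the schedule parts and
# repeating groups we capture. Within each part or group, 'stream_key'
# picks an output stream defined above; every other key is an IRSX
# variable name mapped to an output column ('header'), optionally with a
# numeric 'default' used when the value is missing. 'composite' entries
# combine several source fields into a single output column (the
# combining itself appears to happen in StreamExtractor, not shown here).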
80 | data_capture_dict = {
81 | 'IRS990': {
82 | 'parts': {
83 | 'part_0': {
84 | 'stream_key': '990_part_0', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
85 | 'ein': {'header':'ein'},
86 | 'object_id': {'header':'object_id'},
87 | 'Orgnztn527Ind':{'header':'Orgnztn527Ind'},
88 | 'Orgnztn501cInd':{'header':'Orgnztn501cInd'},
89 | 'Orgnztn49471NtPFInd':{'header':'Orgnztn49471NtPFInd'},
90 | 'Orgnztn501c3Ind' :{'header':'Orgnztn501c3Ind'},
91 | 'WbstAddrssTxt' :{'header':'WbstAddrssTxt'},
92 | 'OfOrgnztnTrstInd' :{'header':'OfOrgnztnTrstInd'},
93 | 'OthrOrgnztnDsc' :{'header':'OthrOrgnztnDsc'},
94 | 'OfOrgnztnCrpInd' :{'header':'OfOrgnztnCrpInd'},
95 | 'OfOrgnztnOthrInd' :{'header':'OfOrgnztnOthrInd'},
96 | 'OfOrgnztnAsscInd' :{'header':'OfOrgnztnAsscInd'},
97 | 'FrmtnYr' :{'header':'FrmtnYr'},
98 | 'LglDmclSttCd' :{'header':'LglDmclSttCd'},
99 | 'LglDmclCntryCd' :{'header':'LglDmclCntryCd'},
100 | },
101 | 'part_i': {
102 | 'stream_key': '990_part_i',
103 | 'ein': {'header':'ein'},
104 | 'object_id': {'header':'object_id'},
105 | 'CntrctTrmntnInd': {'header': "CntrctTrmntnInd"},
106 | 'ActvtyOrMssnDsc': {'header': "ActvtyOrMssnDsc"},
107 | 'TtlEmplyCnt': {'header': "TtlEmplyCnt"},
108 | 'TtlVlntrsCnt': {'header': "TtlVlntrsCnt"},
109 | 'CYInvstmntIncmAmt': {'header': "CYInvstmntIncmAmt"},
110 | 'CYTtlRvnAmt': {'header': "CYTtlRvnAmt"},
111 | 'CYTtlExpnssAmt': {'header': "CYTtlExpnssAmt"},
112 | 'CYRvnsLssExpnssAmt': {'header': "CYRvnsLssExpnssAmt"},
113 | 'TtlAsstsEOYAmt': {'header': "TtlAsstsEOYAmt"}
114 | },
115 | 'part_iv': {
116 | "stream_key": '990_part_iv',
117 | "ein": {'header': "ein"},
118 | "object_id": {'header': "object_id"},
119 | "PrtlLqdtnInd": {'header': "PrtlLqdtnInd"}
120 | }
121 | },
122 | ## The remaining logic is for capturing salaries wherever they appear in
123 | ## the 990, 990PF and 990EZ
124 | 'groups': {
125 | 'Frm990PrtVIISctnA': {
126 | 'stream_key': 'employees_990', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
127 | 'ein': {'header':'ein'},
128 | 'object_id': {'header':'object_id'},
129 | 'PrsnNm': {'header':'name'},
130 | 'BsnssNmLn1Txt':{'header':'business_name1'},
131 | 'BsnssNmLn2Txt':{'header':'business_name2'},
132 | 'TtlTxt': {'header':'title'},
133 | 'RprtblCmpFrmOrgAmt': {
134 | 'header':'org_comp',
135 | 'default':0 # set numeric if missing
136 | },
137 | 'RprtblCmpFrmRltdOrgAmt': {
138 | 'header':'related_comp',
139 | 'default':0
140 | },
141 | 'OthrCmpnstnAmt':{
142 | 'header':'other_cmp',
143 | 'default':0
144 | },
145 | 'IndvdlTrstOrDrctrInd':{'header':'IndvdlTrstOrDrctrInd'},
146 | 'InstttnlTrstInd':{'header':'InstttnlTrstInd'},
147 | 'OffcrInd':{'header':'OffcrInd'},
148 | 'KyEmplyInd':{'header':'KyEmplyInd'},
149 | 'HghstCmpnstdEmplyInd':{'header':'HghstCmpnstdEmplyInd'},
150 | 'FrmrOfcrDrctrTrstInd':{'header':'FrmrOfcrDrctrTrstInd'}
151 | }
152 | }
153 | },
154 | 'IRS990EZ': {
155 | 'parts': {
156 | 'ez_part_0':{
157 | 'stream_key': '990ez_part_0',
158 | 'ein': {'header':'ein'},
159 | 'object_id': {'header':'object_id'},
160 | "WbstAddrssTxt": {'header':'WbstAddrssTxt'},
161 | "Orgnztn527Ind": {'header':'Orgnztn527Ind'},
162 | "Orgnztn501c3Ind": {'header':'Orgnztn501c3Ind'},
163 | "Orgnztn49471NtPFInd": {'header':'Orgnztn49471NtPFInd'},
164 | "Orgnztn501cInd": {'header':'Orgnztn501cInd'},
165 | "OfOrgnztnOthrDsc": {'header':'OfOrgnztnOthrDsc'},
166 | "OfOrgnztnOthrInd": {'header':'OfOrgnztnOthrInd'},
167 | "OfOrgnztnCrpInd": {'header':'OfOrgnztnCrpInd'},
168 | "OfOrgnztnTrstInd": {'header':'OfOrgnztnTrstInd'},
169 | "OfOrgnztnAsscInd": {'header':'OfOrgnztnAsscInd'},
170 | "GrssRcptsAmt": {'header':'GrssRcptsAmt'}
171 | },
172 | 'ez_part_i': {
173 | 'stream_key': '990ez_part_i',
174 | 'ein': {'header':'ein'},
175 | 'object_id': {'header':'object_id'},
176 | "TtlExpnssAmt": {'header':'TtlExpnssAmt'},
177 | "TtlRvnAmt": {'header':'TtlRvnAmt'},
178 | }
179 | },
180 | 'groups': {
181 | 'EZOffcrDrctrTrstEmpl': {
182 | 'stream_key': 'employees_990EZ',
183 | 'ein': {'header':'ein'},
184 | 'object_id': {'header':'object_id'},
185 | 'PrsnNm': {'header':'name'},
186 | 'BsnssNmLn1': {'header':'business_name1'},
187 | 'BsnssNmLn2': {'header':'business_name2'},
188 |
189 |
190 | 'TtlTxt': {'header':'title'},
191 | 'CmpnstnAmt': {
192 | 'header':'org_comp',
193 | 'default':0
194 | },
195 | 'composite': { # other compensation includes benefits and other allowances for EZ, PF filers
196 | 'other_cmp': {
197 | 'EmplyBnftPrgrmAmt': {
198 | 'default':0
199 | },
200 | 'ExpnsAccntOthrAllwncAmt': {
201 | 'default':0
202 | }
203 | }
204 | }
205 | },
206 | 'EZCmpnstnHghstPdEmpl': {
207 | 'stream_key': 'employees_990EZ',
208 | 'ein': {'header':'ein'},
209 | 'object_id': {'header':'object_id'},
210 | 'PrsnNm': {'header':'name'},
211 | 'TtlTxt': {'header':'title'},
212 | 'CmpnstnAmt': {
213 | 'header':'org_comp',
214 | 'default':0
215 | },
216 | 'composite': {
217 | 'other_cmp': {
218 | 'EmplyBnftsAmt': {
219 | 'default':0
220 | },
221 | 'ExpnsAccntAmt': {
222 | 'default':0
223 | }
224 | }
225 | }
226 | }
227 | }
228 | },
229 | 'ReturnHeader990x': {
230 | 'parts': {
231 | 'returnheader990x_part_i': {
232 | 'stream_key': 'header_metadata', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
233 | 'ein': {'header':'ein'},
234 | 'object_id': {'header':'object_id'},
235 | 'RtrnHdr_TxYr':{'header':'RtrnHdr_TxYr'},
236 | 'BsnssNm_BsnssNmLn2Txt': {'header':'BsnssNm_BsnssNmLn2Txt'},
237 | 'BsnssNm_BsnssNmLn1Txt': {'header':'BsnssNm_BsnssNmLn1Txt'},
238 | 'USAddrss_AddrssLn1Txt': {'header':'USAddrss_AddrssLn1Txt'},
239 | 'USAddrss_AddrssLn2Txt': {'header':'USAddrss_AddrssLn2Txt'},
240 | 'USAddrss_CtyNm': {'header':'USAddrss_CtyNm'},
241 | 'USAddrss_SttAbbrvtnCd': {'header':'USAddrss_SttAbbrvtnCd'},
242 | 'RtrnHdr_TxPrdBgnDt': {'header':'RtrnHdr_TxPrdBgnDt'},
243 | 'RtrnHdr_TxPrdEndDt': {'header':'RtrnHdr_TxPrdEndDt'},
244 | 'BsnssOffcr_SgntrDt': {'header': 'BsnssOffcr_SgntrDt'},
245 | 'Flr_PhnNm': {'header': 'Flr_PhnNm'},
246 | 'RtrnHdr_RtrnTs': {'header': 'RtrnHdr_RtrnTs'}
247 | }
248 | }
249 | },
250 | 'IRS990ScheduleL': {
251 | 'parts': {
252 | },
253 | 'groups': {
254 | 'SkdLLnsBtwnOrgIntrstdPrsn': {
255 | 'stream_key': '990L_loans',
256 | 'ein': {'header':'ein'},
257 | 'object_id': {'header':'object_id'},
258 | 'BsnssNmLn1Txt': {'header':'BsnssNmLn1Txt'},
259 | 'BsnssNmLn2Txt': {'header':'BsnssNmLn2Txt'},
260 | 'PrsnNm': {'header':'PrsnNm'},
261 | 'RltnshpWthOrgTxt': {'header':'RltnshpWthOrgTxt'},
262 | 'LnPrpsTxt': {'header':'LnPrpsTxt'},
263 | 'LnFrmOrgnztnInd': {'header':'LnFrmOrgnztnInd'},
264 | 'LnTOrgnztnInd': {'header':'LnTOrgnztnInd'},
265 | 'OrgnlPrncplAmt': {'header':'OrgnlPrncplAmt'},
266 | 'BlncDAmt': {'header':'BlncDAmt'},
267 | 'DfltInd': {'header':'DfltInd'},
268 | 'BrdOrCmmttApprvlInd': {'header':'BrdOrCmmttApprvlInd'},
269 | 'WrttnAgrmntInd': {'header':'WrttnAgrmntInd'}
270 | },
271 | 'SkdLGrntAsstBnftIntrstdPrsn': {
272 | 'stream_key': '990L_grants',
273 | 'ein': {'header':'ein'},
274 | 'object_id': {'header':'object_id'},
275 | "PrsnNm": {'header':'PrsnNm'},
276 | "BsnssNmLn1Txt": {'header':'BsnssNmLn1Txt'},
277 | "BsnssNmLn2Txt": {'header':'BsnssNmLn2Txt'},
278 | "RltnshpWthOrgTxt": {'header':'RltnshpWthOrgTxt'},
279 | "CshGrntAmt": {'header':'CshGrntAmt'},
280 | "OfAssstncTxt": {'header':'OfAssstncTxt'},
281 | "AssstncPrpsTxt": {'header':'AssstncPrpsTxt'},
282 | },
283 | 'SkdLBsTrInvlvIntrstdPrsn': {
284 | 'stream_key': '990L_trans',
285 | 'ein': {'header':'ein'},
286 | 'object_id': {'header':'object_id'},
287 | "BsnssNmLn1Txt": {'header':'BsnssNmLn1Txt'},
288 | "PrsnNm": {'header':'PrsnNm'},
289 | "BsnssNmLn2Txt": {'header':'BsnssNmLn2Txt'},
290 | "RltnshpDscrptnTxt": {'header':'RltnshpDscrptnTxt'},
291 | "TrnsctnAmt": {'header':'TrnsctnAmt'},
292 | "TrnsctnDsc": {'header':'TrnsctnDsc'},
293 | "ShrngOfRvnsInd": {'header':'ShrngOfRvnsInd'}
294 | }
295 | }
296 | },
297 |
298 | 'IRS990PF': {
299 | 'parts': {
300 | 'pf_part_0': {
301 | 'stream_key': '990pf_part_0', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
302 | 'ein': {'header':'ein'},
303 | 'object_id': {'header':'object_id'},
304 | "PFSttsTrmSct507b1AInd": {'header':'PFSttsTrmSct507b1AInd'},
305 | "Orgnztn501c3TxblPFInd": {'header':'Orgnztn501c3TxblPFInd'},
306 | "Orgnztn501c3ExmptPFInd": {'header':'Orgnztn501c3ExmptPFInd'},
307 | "Orgnztn49471TrtdPFInd": {'header':'Orgnztn49471TrtdPFInd'},
308 | "FMVAsstsEOYAmt": {'header':'FMVAsstsEOYAmt'},
309 | },
310 | 'pf_part_i': {
311 | 'stream_key': '990pf_part_i', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
312 | 'ein': {'header':'ein'},
313 | 'object_id': {'header':'object_id'},
314 | 'TtlRvAndExpnssAmt': {'header':'TtlRvAndExpnssAmt'},
315 | 'CmpOfcrDrTrstRvAndExpnssAmt': {'header':'CmpOfcrDrTrstRvAndExpnssAmt'},
316 | 'OthEmplSlrsWgsRvAndExpnssAmt': {'header':'OthEmplSlrsWgsRvAndExpnssAmt'},
317 | 'TtOprExpnssRvAndExpnssAmt': {'header':'TtOprExpnssRvAndExpnssAmt'},
318 | 'CntrPdRvAndExpnssAmt': {'header':'CntrPdRvAndExpnssAmt'},
319 | 'TtlExpnssRvAndExpnssAmt': {'header':'TtlExpnssRvAndExpnssAmt'}
320 | },
321 | 'pf_part_viia': {
322 | 'stream_key': '990pf_part_viia', # 'stream_key' specifies where the output goes--must exist as a key in output_streams
323 | 'ein': {'header':'ein'},
324 | 'object_id': {'header':'object_id'},
325 | 'SttmntsRgrdngActy_WbstAddrssTxt': {'header':'SttmntsRgrdngActy_WbstAddrssTxt'}
326 | }
327 | },
328 | 'groups': {
329 | 'PFOffcrDrTrstKyEmpl': {
330 | 'stream_key': 'employees_990PF',
331 |
332 | 'ein': {'header':'ein'},
333 | 'object_id': {'header':'object_id'},
334 | 'OffcrDrTrstKyEmpl_PrsnNm': {'header':'name'},
335 | 'OffcrDrTrstKyEmpl_BsnssNmLn1': {'header':'business_name1'},
336 | 'OffcrDrTrstKyEmpl_BsnssNmLn2': {'header':'business_name2'},
337 | 'OffcrDrTrstKyEmpl_TtlTxt': {'header':'title'},
338 | 'OffcrDrTrstKyEmpl_CmpnstnAmt': {
339 | 'header':'org_comp',
340 | 'default':0 # set numeric if missing
341 | },
342 | 'composite': {
343 | 'other_cmp': {
344 | 'OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt': {
345 | 'default':0
346 | },
347 | 'OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt': {
348 | 'default':0
349 | }
350 | }
351 | }
352 | },
353 | 'PFCmpnstnHghstPdEmpl': {
354 | 'stream_key': 'employees_990PF',
355 |
356 | 'ein': {'header':'ein'},
357 | 'object_id': {'header':'object_id'},
358 | 'CmpnstnHghstPdEmpl_PrsnNm': {'header':'name'},
359 | 'CmpnstnHghstPdEmpl_TtlTxt': {'header':'title'},
360 | 'CmpnstnHghstPdEmpl_CmpnstnAmt': {
361 | 'header':'org_comp',
362 | 'default':0 # set numeric if missing
363 | },
364 | 'composite': {
365 | 'other_cmp': {
366 | 'CmpnstnHghstPdEmpl_EmplyBnftsAmt': {
367 | 'default':0
368 | },
369 | 'CmpnstnHghstPdEmpl_ExpnsAccntAmt': {
370 | 'default':0
371 | }
372 | }
373 | }
374 | }
375 | }
376 | }
377 | }
378 |
379 | if __name__ == '__main__':
380 |
381 | input_file = "initial_manifest.csv"
382 |
383 |
384 |
385 | # read the whole file in here, it's not very long
386 | file_rows = []
387 |
388 | # We're using the output of part 1
389 | with open(input_file, 'rb') as infile:
390 | reader = csv.DictReader(infile)
391 | for row in reader:
392 | file_rows.append(row)
393 |
394 |
395 | extractor = StreamExtractor(output_streams, data_capture_dict)
396 |
397 |
398 | filing_count = 0
399 | for metadata_row in file_rows:
400 |
401 | try:
402 | object_id = metadata_row['object_id']
403 | if object_id:
404 | #print("Running %s " % metadata_row['object_id'])
405 | extractor.run_filing(object_id, taxpayer_name=metadata_row['name'])
406 |
407 | filing_count += 1
408 | if filing_count % 100 == 0:
409 | print("Processed %s filings" % filing_count)
410 |
411 |
412 | except FileMissingException:
413 | pass
414 | #print("Missing %s skipping " % metadata_row['object_id'])
415 |
416 |
417 |
--------------------------------------------------------------------------------
/irsdb/static/js/bootstrap.min.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap v3.3.7 (http://getbootstrap.com)
3 | * Copyright 2011-2016 Twitter, Inc.
4 | * Licensed under the MIT license
5 | */
6 | if("undefined"==typeof jQuery)throw new Error("Bootstrap's JavaScript requires jQuery");+function(a){"use strict";var b=a.fn.jquery.split(" ")[0].split(".");if(b[0]<2&&b[1]<9||1==b[0]&&9==b[1]&&b[2]<1||b[0]>3)throw new Error("Bootstrap's JavaScript requires jQuery version 1.9.1 or higher, but lower than version 4")}(jQuery),+function(a){"use strict";function b(){var a=document.createElement("bootstrap"),b={WebkitTransition:"webkitTransitionEnd",MozTransition:"transitionend",OTransition:"oTransitionEnd otransitionend",transition:"transitionend"};for(var c in b)if(void 0!==a.style[c])return{end:b[c]};return!1}a.fn.emulateTransitionEnd=function(b){var c=!1,d=this;a(this).one("bsTransitionEnd",function(){c=!0});var e=function(){c||a(d).trigger(a.support.transition.end)};return setTimeout(e,b),this},a(function(){a.support.transition=b(),a.support.transition&&(a.event.special.bsTransitionEnd={bindType:a.support.transition.end,delegateType:a.support.transition.end,handle:function(b){if(a(b.target).is(this))return b.handleObj.handler.apply(this,arguments)}})})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var c=a(this),e=c.data("bs.alert");e||c.data("bs.alert",e=new d(this)),"string"==typeof b&&e[b].call(c)})}var c='[data-dismiss="alert"]',d=function(b){a(b).on("click",c,this.close)};d.VERSION="3.3.7",d.TRANSITION_DURATION=150,d.prototype.close=function(b){function c(){g.detach().trigger("closed.bs.alert").remove()}var e=a(this),f=e.attr("data-target");f||(f=e.attr("href"),f=f&&f.replace(/.*(?=#[^\s]*$)/,""));var g=a("#"===f?[]:f);b&&b.preventDefault(),g.length||(g=e.closest(".alert")),g.trigger(b=a.Event("close.bs.alert")),b.isDefaultPrevented()||(g.removeClass("in"),a.support.transition&&g.hasClass("fade")?g.one("bsTransitionEnd",c).emulateTransitionEnd(d.TRANSITION_DURATION):c())};var e=a.fn.alert;a.fn.alert=b,a.fn.alert.Constructor=d,a.fn.alert.noConflict=function(){return a.fn.alert=e,this},a(document).on("click.bs.alert.data-api",c,d.prototype.close)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.button"),f="object"==typeof b&&b;e||d.data("bs.button",e=new c(this,f)),"toggle"==b?e.toggle():b&&e.setState(b)})}var c=function(b,d){this.$element=a(b),this.options=a.extend({},c.DEFAULTS,d),this.isLoading=!1};c.VERSION="3.3.7",c.DEFAULTS={loadingText:"loading..."},c.prototype.setState=function(b){var c="disabled",d=this.$element,e=d.is("input")?"val":"html",f=d.data();b+="Text",null==f.resetText&&d.data("resetText",d[e]()),setTimeout(a.proxy(function(){d[e](null==f[b]?this.options[b]:f[b]),"loadingText"==b?(this.isLoading=!0,d.addClass(c).attr(c,c).prop(c,!0)):this.isLoading&&(this.isLoading=!1,d.removeClass(c).removeAttr(c).prop(c,!1))},this),0)},c.prototype.toggle=function(){var a=!0,b=this.$element.closest('[data-toggle="buttons"]');if(b.length){var c=this.$element.find("input");"radio"==c.prop("type")?(c.prop("checked")&&(a=!1),b.find(".active").removeClass("active"),this.$element.addClass("active")):"checkbox"==c.prop("type")&&(c.prop("checked")!==this.$element.hasClass("active")&&(a=!1),this.$element.toggleClass("active")),c.prop("checked",this.$element.hasClass("active")),a&&c.trigger("change")}else this.$element.attr("aria-pressed",!this.$element.hasClass("active")),this.$element.toggleClass("active")};var d=a.fn.button;a.fn.button=b,a.fn.button.Constructor=c,a.fn.button.noConflict=function(){return 
a.fn.button=d,this},a(document).on("click.bs.button.data-api",'[data-toggle^="button"]',function(c){var d=a(c.target).closest(".btn");b.call(d,"toggle"),a(c.target).is('input[type="radio"], input[type="checkbox"]')||(c.preventDefault(),d.is("input,button")?d.trigger("focus"):d.find("input:visible,button:visible").first().trigger("focus"))}).on("focus.bs.button.data-api blur.bs.button.data-api",'[data-toggle^="button"]',function(b){a(b.target).closest(".btn").toggleClass("focus",/^focus(in)?$/.test(b.type))})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.carousel"),f=a.extend({},c.DEFAULTS,d.data(),"object"==typeof b&&b),g="string"==typeof b?b:f.slide;e||d.data("bs.carousel",e=new c(this,f)),"number"==typeof b?e.to(b):g?e[g]():f.interval&&e.pause().cycle()})}var c=function(b,c){this.$element=a(b),this.$indicators=this.$element.find(".carousel-indicators"),this.options=c,this.paused=null,this.sliding=null,this.interval=null,this.$active=null,this.$items=null,this.options.keyboard&&this.$element.on("keydown.bs.carousel",a.proxy(this.keydown,this)),"hover"==this.options.pause&&!("ontouchstart"in document.documentElement)&&this.$element.on("mouseenter.bs.carousel",a.proxy(this.pause,this)).on("mouseleave.bs.carousel",a.proxy(this.cycle,this))};c.VERSION="3.3.7",c.TRANSITION_DURATION=600,c.DEFAULTS={interval:5e3,pause:"hover",wrap:!0,keyboard:!0},c.prototype.keydown=function(a){if(!/input|textarea/i.test(a.target.tagName)){switch(a.which){case 37:this.prev();break;case 39:this.next();break;default:return}a.preventDefault()}},c.prototype.cycle=function(b){return b||(this.paused=!1),this.interval&&clearInterval(this.interval),this.options.interval&&!this.paused&&(this.interval=setInterval(a.proxy(this.next,this),this.options.interval)),this},c.prototype.getItemIndex=function(a){return this.$items=a.parent().children(".item"),this.$items.index(a||this.$active)},c.prototype.getItemForDirection=function(a,b){var c=this.getItemIndex(b),d="prev"==a&&0===c||"next"==a&&c==this.$items.length-1;if(d&&!this.options.wrap)return b;var e="prev"==a?-1:1,f=(c+e)%this.$items.length;return this.$items.eq(f)},c.prototype.to=function(a){var b=this,c=this.getItemIndex(this.$active=this.$element.find(".item.active"));if(!(a>this.$items.length-1||a<0))return this.sliding?this.$element.one("slid.bs.carousel",function(){b.to(a)}):c==a?this.pause().cycle():this.slide(a>c?"next":"prev",this.$items.eq(a))},c.prototype.pause=function(b){return b||(this.paused=!0),this.$element.find(".next, .prev").length&&a.support.transition&&(this.$element.trigger(a.support.transition.end),this.cycle(!0)),this.interval=clearInterval(this.interval),this},c.prototype.next=function(){if(!this.sliding)return this.slide("next")},c.prototype.prev=function(){if(!this.sliding)return this.slide("prev")},c.prototype.slide=function(b,d){var e=this.$element.find(".item.active"),f=d||this.getItemForDirection(b,e),g=this.interval,h="next"==b?"left":"right",i=this;if(f.hasClass("active"))return this.sliding=!1;var j=f[0],k=a.Event("slide.bs.carousel",{relatedTarget:j,direction:h});if(this.$element.trigger(k),!k.isDefaultPrevented()){if(this.sliding=!0,g&&this.pause(),this.$indicators.length){this.$indicators.find(".active").removeClass("active");var l=a(this.$indicators.children()[this.getItemIndex(f)]);l&&l.addClass("active")}var m=a.Event("slid.bs.carousel",{relatedTarget:j,direction:h});return 
a.support.transition&&this.$element.hasClass("slide")?(f.addClass(b),f[0].offsetWidth,e.addClass(h),f.addClass(h),e.one("bsTransitionEnd",function(){f.removeClass([b,h].join(" ")).addClass("active"),e.removeClass(["active",h].join(" ")),i.sliding=!1,setTimeout(function(){i.$element.trigger(m)},0)}).emulateTransitionEnd(c.TRANSITION_DURATION)):(e.removeClass("active"),f.addClass("active"),this.sliding=!1,this.$element.trigger(m)),g&&this.cycle(),this}};var d=a.fn.carousel;a.fn.carousel=b,a.fn.carousel.Constructor=c,a.fn.carousel.noConflict=function(){return a.fn.carousel=d,this};var e=function(c){var d,e=a(this),f=a(e.attr("data-target")||(d=e.attr("href"))&&d.replace(/.*(?=#[^\s]+$)/,""));if(f.hasClass("carousel")){var g=a.extend({},f.data(),e.data()),h=e.attr("data-slide-to");h&&(g.interval=!1),b.call(f,g),h&&f.data("bs.carousel").to(h),c.preventDefault()}};a(document).on("click.bs.carousel.data-api","[data-slide]",e).on("click.bs.carousel.data-api","[data-slide-to]",e),a(window).on("load",function(){a('[data-ride="carousel"]').each(function(){var c=a(this);b.call(c,c.data())})})}(jQuery),+function(a){"use strict";function b(b){var c,d=b.attr("data-target")||(c=b.attr("href"))&&c.replace(/.*(?=#[^\s]+$)/,"");return a(d)}function c(b){return this.each(function(){var c=a(this),e=c.data("bs.collapse"),f=a.extend({},d.DEFAULTS,c.data(),"object"==typeof b&&b);!e&&f.toggle&&/show|hide/.test(b)&&(f.toggle=!1),e||c.data("bs.collapse",e=new d(this,f)),"string"==typeof b&&e[b]()})}var d=function(b,c){this.$element=a(b),this.options=a.extend({},d.DEFAULTS,c),this.$trigger=a('[data-toggle="collapse"][href="#'+b.id+'"],[data-toggle="collapse"][data-target="#'+b.id+'"]'),this.transitioning=null,this.options.parent?this.$parent=this.getParent():this.addAriaAndCollapsedClass(this.$element,this.$trigger),this.options.toggle&&this.toggle()};d.VERSION="3.3.7",d.TRANSITION_DURATION=350,d.DEFAULTS={toggle:!0},d.prototype.dimension=function(){var a=this.$element.hasClass("width");return a?"width":"height"},d.prototype.show=function(){if(!this.transitioning&&!this.$element.hasClass("in")){var b,e=this.$parent&&this.$parent.children(".panel").children(".in, .collapsing");if(!(e&&e.length&&(b=e.data("bs.collapse"),b&&b.transitioning))){var f=a.Event("show.bs.collapse");if(this.$element.trigger(f),!f.isDefaultPrevented()){e&&e.length&&(c.call(e,"hide"),b||e.data("bs.collapse",null));var g=this.dimension();this.$element.removeClass("collapse").addClass("collapsing")[g](0).attr("aria-expanded",!0),this.$trigger.removeClass("collapsed").attr("aria-expanded",!0),this.transitioning=1;var h=function(){this.$element.removeClass("collapsing").addClass("collapse in")[g](""),this.transitioning=0,this.$element.trigger("shown.bs.collapse")};if(!a.support.transition)return h.call(this);var i=a.camelCase(["scroll",g].join("-"));this.$element.one("bsTransitionEnd",a.proxy(h,this)).emulateTransitionEnd(d.TRANSITION_DURATION)[g](this.$element[0][i])}}}},d.prototype.hide=function(){if(!this.transitioning&&this.$element.hasClass("in")){var b=a.Event("hide.bs.collapse");if(this.$element.trigger(b),!b.isDefaultPrevented()){var c=this.dimension();this.$element[c](this.$element[c]())[0].offsetHeight,this.$element.addClass("collapsing").removeClass("collapse in").attr("aria-expanded",!1),this.$trigger.addClass("collapsed").attr("aria-expanded",!1),this.transitioning=1;var e=function(){this.transitioning=0,this.$element.removeClass("collapsing").addClass("collapse").trigger("hidden.bs.collapse")};return a.support.transition?void 
this.$element[c](0).one("bsTransitionEnd",a.proxy(e,this)).emulateTransitionEnd(d.TRANSITION_DURATION):e.call(this)}}},d.prototype.toggle=function(){this[this.$element.hasClass("in")?"hide":"show"]()},d.prototype.getParent=function(){return a(this.options.parent).find('[data-toggle="collapse"][data-parent="'+this.options.parent+'"]').each(a.proxy(function(c,d){var e=a(d);this.addAriaAndCollapsedClass(b(e),e)},this)).end()},d.prototype.addAriaAndCollapsedClass=function(a,b){var c=a.hasClass("in");a.attr("aria-expanded",c),b.toggleClass("collapsed",!c).attr("aria-expanded",c)};var e=a.fn.collapse;a.fn.collapse=c,a.fn.collapse.Constructor=d,a.fn.collapse.noConflict=function(){return a.fn.collapse=e,this},a(document).on("click.bs.collapse.data-api",'[data-toggle="collapse"]',function(d){var e=a(this);e.attr("data-target")||d.preventDefault();var f=b(e),g=f.data("bs.collapse"),h=g?"toggle":e.data();c.call(f,h)})}(jQuery),+function(a){"use strict";function b(b){var c=b.attr("data-target");c||(c=b.attr("href"),c=c&&/#[A-Za-z]/.test(c)&&c.replace(/.*(?=#[^\s]*$)/,""));var d=c&&a(c);return d&&d.length?d:b.parent()}function c(c){c&&3===c.which||(a(e).remove(),a(f).each(function(){var d=a(this),e=b(d),f={relatedTarget:this};e.hasClass("open")&&(c&&"click"==c.type&&/input|textarea/i.test(c.target.tagName)&&a.contains(e[0],c.target)||(e.trigger(c=a.Event("hide.bs.dropdown",f)),c.isDefaultPrevented()||(d.attr("aria-expanded","false"),e.removeClass("open").trigger(a.Event("hidden.bs.dropdown",f)))))}))}function d(b){return this.each(function(){var c=a(this),d=c.data("bs.dropdown");d||c.data("bs.dropdown",d=new g(this)),"string"==typeof b&&d[b].call(c)})}var e=".dropdown-backdrop",f='[data-toggle="dropdown"]',g=function(b){a(b).on("click.bs.dropdown",this.toggle)};g.VERSION="3.3.7",g.prototype.toggle=function(d){var e=a(this);if(!e.is(".disabled, :disabled")){var f=b(e),g=f.hasClass("open");if(c(),!g){"ontouchstart"in document.documentElement&&!f.closest(".navbar-nav").length&&a(document.createElement("div")).addClass("dropdown-backdrop").insertAfter(a(this)).on("click",c);var h={relatedTarget:this};if(f.trigger(d=a.Event("show.bs.dropdown",h)),d.isDefaultPrevented())return;e.trigger("focus").attr("aria-expanded","true"),f.toggleClass("open").trigger(a.Event("shown.bs.dropdown",h))}return!1}},g.prototype.keydown=function(c){if(/(38|40|27|32)/.test(c.which)&&!/input|textarea/i.test(c.target.tagName)){var d=a(this);if(c.preventDefault(),c.stopPropagation(),!d.is(".disabled, :disabled")){var e=b(d),g=e.hasClass("open");if(!g&&27!=c.which||g&&27==c.which)return 27==c.which&&e.find(f).trigger("focus"),d.trigger("click");var h=" li:not(.disabled):visible a",i=e.find(".dropdown-menu"+h);if(i.length){var j=i.index(c.target);38==c.which&&j>0&&j--,40==c.which&&jdocument.documentElement.clientHeight;this.$element.css({paddingLeft:!this.bodyIsOverflowing&&a?this.scrollbarWidth:"",paddingRight:this.bodyIsOverflowing&&!a?this.scrollbarWidth:""})},c.prototype.resetAdjustments=function(){this.$element.css({paddingLeft:"",paddingRight:""})},c.prototype.checkScrollbar=function(){var a=window.innerWidth;if(!a){var b=document.documentElement.getBoundingClientRect();a=b.right-Math.abs(b.left)}this.bodyIsOverflowing=document.body.clientWidth
',trigger:"hover focus",title:"",delay:0,html:!1,container:!1,viewport:{selector:"body",padding:0}},c.prototype.init=function(b,c,d){if(this.enabled=!0,this.type=b,this.$element=a(c),this.options=this.getOptions(d),this.$viewport=this.options.viewport&&a(a.isFunction(this.options.viewport)?this.options.viewport.call(this,this.$element):this.options.viewport.selector||this.options.viewport),this.inState={click:!1,hover:!1,focus:!1},this.$element[0]instanceof document.constructor&&!this.options.selector)throw new Error("`selector` option must be specified when initializing "+this.type+" on the window.document object!");for(var e=this.options.trigger.split(" "),f=e.length;f--;){var g=e[f];if("click"==g)this.$element.on("click."+this.type,this.options.selector,a.proxy(this.toggle,this));else if("manual"!=g){var h="hover"==g?"mouseenter":"focusin",i="hover"==g?"mouseleave":"focusout";this.$element.on(h+"."+this.type,this.options.selector,a.proxy(this.enter,this)),this.$element.on(i+"."+this.type,this.options.selector,a.proxy(this.leave,this))}}this.options.selector?this._options=a.extend({},this.options,{trigger:"manual",selector:""}):this.fixTitle()},c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.getOptions=function(b){return b=a.extend({},this.getDefaults(),this.$element.data(),b),b.delay&&"number"==typeof b.delay&&(b.delay={show:b.delay,hide:b.delay}),b},c.prototype.getDelegateOptions=function(){var b={},c=this.getDefaults();return this._options&&a.each(this._options,function(a,d){c[a]!=d&&(b[a]=d)}),b},c.prototype.enter=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof a.Event&&(c.inState["focusin"==b.type?"focus":"hover"]=!0),c.tip().hasClass("in")||"in"==c.hoverState?void(c.hoverState="in"):(clearTimeout(c.timeout),c.hoverState="in",c.options.delay&&c.options.delay.show?void(c.timeout=setTimeout(function(){"in"==c.hoverState&&c.show()},c.options.delay.show)):c.show())},c.prototype.isInStateTrue=function(){for(var a in this.inState)if(this.inState[a])return!0;return!1},c.prototype.leave=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);if(c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof a.Event&&(c.inState["focusout"==b.type?"focus":"hover"]=!1),!c.isInStateTrue())return clearTimeout(c.timeout),c.hoverState="out",c.options.delay&&c.options.delay.hide?void(c.timeout=setTimeout(function(){"out"==c.hoverState&&c.hide()},c.options.delay.hide)):c.hide()},c.prototype.show=function(){var b=a.Event("show.bs."+this.type);if(this.hasContent()&&this.enabled){this.$element.trigger(b);var d=a.contains(this.$element[0].ownerDocument.documentElement,this.$element[0]);if(b.isDefaultPrevented()||!d)return;var e=this,f=this.tip(),g=this.getUID(this.type);this.setContent(),f.attr("id",g),this.$element.attr("aria-describedby",g),this.options.animation&&f.addClass("fade");var h="function"==typeof this.options.placement?this.options.placement.call(this,f[0],this.$element[0]):this.options.placement,i=/\s?auto?\s?/i,j=i.test(h);j&&(h=h.replace(i,"")||"top"),f.detach().css({top:0,left:0,display:"block"}).addClass(h).data("bs."+this.type,this),this.options.container?f.appendTo(this.options.container):f.insertAfter(this.$element),this.$element.trigger("inserted.bs."+this.type);var 
k=this.getPosition(),l=f[0].offsetWidth,m=f[0].offsetHeight;if(j){var n=h,o=this.getPosition(this.$viewport);h="bottom"==h&&k.bottom+m>o.bottom?"top":"top"==h&&k.top-m